Raw data

- GEO number: GSE128615

- Study summary: To determine the influence of differential Kdm6a expression in immune cells, whole transcriptome analysis for CD4+ T cells from WT and Kdm6a cKO mice were performed using RNA-Seq.

Loading packages

- DESeq2: http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html

library(data.table)
library(tidyverse)
library(rmarkdown)
library(ggplot2)
library(pheatmap)
library(AnnotationHub)
library(tximport)
library(Rsubread)
library(DESeq2)
library(UpSetR)
library(ensembldb)
library(gridExtra)

Setting AnnotationHub

Assign your species of interest

DB <- "EnsDb"
AnnotationSpecies <- "mus musculus"  # Assign your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))  # Bring annotation DB

Running AnnotationHub

ahQuery <- query(ah, 
                 pattern=c(DB, AnnotationSpecies), 
                 ignore.case=T)      # Filter annotation of interest


# Select the most recent data
DBName <- mcols(ahQuery) %>%
    rownames() %>%
    tail(1)

# Save the dataset of interest
AnnoDb <- ah[[DBName]] 

# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="TXID",
                 columns=c("GENEID", "GENENAME")) 


# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)    # The column 1 has to assign transcript (e.g. ENSEMBLTRANS)
##                 TXID             GENEID GENENAME
## 1 ENSMUST00000000001 ENSMUSG00000000001    Gnai3
## 2 ENSMUST00000000003 ENSMUSG00000000003     Pbsn
## 3 ENSMUST00000000010 ENSMUSG00000020875    Hoxb9
## 4 ENSMUST00000000028 ENSMUSG00000000028    Cdc45
## 5 ENSMUST00000000033 ENSMUSG00000048583     Igf2
## 6 ENSMUST00000000049 ENSMUSG00000000049     Apoh

Metadata setting

# This code chunk needs to be written by yourself 

# Define sample names 
SampleNames <- c(paste0("WT-rep", 1:3), paste0("cKO-rep", 1:3))  

# Aligner names
Aligners <- c("Salmon", "STAR", "HISAT2")

# Define group level
GroupLevel <- c("WT", "cKO")

# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)

# Set a function for file paths
path.fn <- function(head, tail) { 

    vec <- c(paste0("../", 
                    head,    # head = e.g. "hisat2", "star", or "salmon"
                    "_output/",
                    SampleNames,
                    tail))   # tail = file name after SampleNames 

    return(vec)
}


# Define .sf file path
sf <- path.fn("salmon",
              ".salmon_quant/quant.sf")


# Define STAR file path
star <- path.fn("star", 
                "Aligned.sortedByCoord.out.bam")

# Define HISAT2 file path
hisat <- path.fn("hisat2",
                 ".sorted.bam")


# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))

# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Salmon_path=sf,
                       STAR_path=star, 
                       HISAT2_path=hisat)

# Assign row names with sample names
rownames(metadata) <- SampleNames


# Explore the metadata
print(metadata)
##            Sample Group                                     Salmon_path
## WT-rep1   WT-rep1    WT  ../salmon_output/WT-rep1.salmon_quant/quant.sf
## WT-rep2   WT-rep2    WT  ../salmon_output/WT-rep2.salmon_quant/quant.sf
## WT-rep3   WT-rep3    WT  ../salmon_output/WT-rep3.salmon_quant/quant.sf
## cKO-rep1 cKO-rep1   cKO ../salmon_output/cKO-rep1.salmon_quant/quant.sf
## cKO-rep2 cKO-rep2   cKO ../salmon_output/cKO-rep2.salmon_quant/quant.sf
## cKO-rep3 cKO-rep3   cKO ../salmon_output/cKO-rep3.salmon_quant/quant.sf
##                                                     STAR_path
## WT-rep1   ../star_output/WT-rep1Aligned.sortedByCoord.out.bam
## WT-rep2   ../star_output/WT-rep2Aligned.sortedByCoord.out.bam
## WT-rep3   ../star_output/WT-rep3Aligned.sortedByCoord.out.bam
## cKO-rep1 ../star_output/cKO-rep1Aligned.sortedByCoord.out.bam
## cKO-rep2 ../star_output/cKO-rep2Aligned.sortedByCoord.out.bam
## cKO-rep3 ../star_output/cKO-rep3Aligned.sortedByCoord.out.bam
##                                   HISAT2_path
## WT-rep1   ../hisat2_output/WT-rep1.sorted.bam
## WT-rep2   ../hisat2_output/WT-rep2.sorted.bam
## WT-rep3   ../hisat2_output/WT-rep3.sorted.bam
## cKO-rep1 ../hisat2_output/cKO-rep1.sorted.bam
## cKO-rep2 ../hisat2_output/cKO-rep2.sorted.bam
## cKO-rep3 ../hisat2_output/cKO-rep3.sorted.bam

featureCounts parameter setting

# "mm10", "mm9", "hg38", or "hg19"
# cf. mm10 = GRCm38
annot.inbuilt <- "mm10" 

# GTF file path
annot.ext <- "../../mouse_reference/gencode.m25.primary_assembly.annotation.gtf"

# annotation type:
# e.g.: "gene_id", "transcript_id", or "gene_name"
GTF.attrType <- "gene_id"

# Number of cores 
nthread <- 16 

# Set a function importing counts from BAM files with featureCounts()
fcounts.fn <- function(vec) {

    fc <- featureCounts(files=vec,   # a vector assigning BAM file paths
                         annot.inbuilt=annot.inbuilt,
                         annot.ext=annot.ext,
                         GTF.attrType=GTF.attrType,
                         isGTFAnnotationFile=T,
                         nthread=nthread, 
                         isPairedEnd=F, # Set this parameter correctly 
                         verbose=T)

    return(fc$counts)
}

Importing counts

Importing Salmon counts

Note:

txi1 <- tximport(…, txOut=F)

txi2 <- tximport(…, txOut=T)

txi2 <- summarizedToGene(…)

counts extracted from txi1 and txi2 are the same

# Import gene level summarized counts 
salmon.txi <- tximport(metadata$Salmon_path,
                       type = "salmon",
                       tx2gene=AnnoDb,
                       ignoreTxVersion=T, 
                       txOut=F)       # TRUE for transcript level, FALSE for gene level 

# Extract the counts and save as a data frame
salmon.counts <- salmon.txi$counts

# Explore the salmon count data frame
head(salmon.counts)
##                        [,1]     [,2]     [,3]    [,4]    [,5]     [,6]
## ENSMUSG00000000001 6492.000 5822.000 5614.000 6040.00 6142.00 5437.000
## ENSMUSG00000000003    0.000    0.000    0.000    0.00    0.00    0.000
## ENSMUSG00000000028 3904.066 3567.989 3714.192 3410.46 3463.04 3142.454
## ENSMUSG00000000031   33.000   45.000   58.000   12.00   28.00   70.000
## ENSMUSG00000000037  119.001   90.001   82.000  165.00  129.00  100.000
## ENSMUSG00000000049    0.000    0.000    0.000    1.00    2.00    0.000
dim(salmon.counts)
## [1] 51956     6
summary(salmon.counts)
##        V1               V2                 V3                 V4          
##  Min.   :     0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0   Median :     0.0   Median :     0.0   Median :     0.0  
##  Mean   :   721   Mean   :   642.5   Mean   :   627.2   Mean   :   658.2  
##  3rd Qu.:    22   3rd Qu.:    20.0   3rd Qu.:    20.0   3rd Qu.:    21.2  
##  Max.   :357534   Max.   :322612.0   Max.   :318293.5   Max.   :343971.6  
##        V5                 V6          
##  Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0.0   Median :     0.0  
##  Mean   :   662.9   Mean   :   568.5  
##  3rd Qu.:    20.1   3rd Qu.:    18.0  
##  Max.   :337557.2   Max.   :305036.9

Importing STAR counts

# Extract counts by running featureCounts() 
star.counts <- fcounts.fn(metadata$STAR_path)
## 
##         ==========     _____ _    _ ____  _____  ______          _____  
##         =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
##           =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
##             ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
##               ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
##         ==========   |_____/ \____/|____/|_|  \_\______/_/    \_\_____/
##        Rsubread 2.4.0
## 
## //========================== featureCounts setting ===========================\\
## ||                                                                            ||
## ||             Input files : 6 BAM files                                      ||
## ||                                                                            ||
## ||                           WT-rep1Aligned.sortedByCoord.out.bam             ||
## ||                           WT-rep2Aligned.sortedByCoord.out.bam             ||
## ||                           WT-rep3Aligned.sortedByCoord.out.bam             ||
## ||                           cKO-rep1Aligned.sortedByCoord.out.bam            ||
## ||                           cKO-rep2Aligned.sortedByCoord.out.bam            ||
## ||                           cKO-rep3Aligned.sortedByCoord.out.bam            ||
## ||                                                                            ||
## ||              Paired-end : no                                               ||
## ||        Count read pairs : no                                               ||
## ||              Annotation : gencode.m25.primary_assembly.annotation.gtf  ... ||
## ||      Dir for temp files : .                                                ||
## ||                 Threads : 16                                               ||
## ||                   Level : meta-feature level                               ||
## ||      Multimapping reads : counted                                          ||
## || Multi-overlapping reads : not counted                                      ||
## ||   Min overlapping bases : 1                                                ||
## ||                                                                            ||
## \\============================================================================//
## 
## //================================= Running ==================================\\
## ||                                                                            ||
## || Load annotation file gencode.m25.primary_assembly.annotation.gtf ...       ||
## ||    Features : 843712                                                       ||
## ||    Meta-features : 55487                                                   ||
## ||    Chromosomes/contigs : 45                                                ||
## ||                                                                            ||
## || Process BAM file WT-rep1Aligned.sortedByCoord.out.bam...                   ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 70746461                                             ||
## ||    Successfully assigned alignments : 55979059 (79.1%)                     ||
## ||    Running time : 0.07 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep2Aligned.sortedByCoord.out.bam...                   ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 63394034                                             ||
## ||    Successfully assigned alignments : 50159175 (79.1%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep3Aligned.sortedByCoord.out.bam...                   ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 61430033                                             ||
## ||    Successfully assigned alignments : 48522991 (79.0%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep1Aligned.sortedByCoord.out.bam...                  ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 64533984                                             ||
## ||    Successfully assigned alignments : 51163980 (79.3%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep2Aligned.sortedByCoord.out.bam...                  ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 65134349                                             ||
## ||    Successfully assigned alignments : 51790758 (79.5%)                     ||
## ||    Running time : 0.07 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep3Aligned.sortedByCoord.out.bam...                  ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 56200709                                             ||
## ||    Successfully assigned alignments : 44128747 (78.5%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Write the final count table.                                               ||
## || Write the read assignment summary.                                         ||
## ||                                                                            ||
## \\============================================================================//
# Explore the STAR count data frame
head(star.counts)
##                      WT-rep1Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                    0
## ENSMUSG00000064842.1                                    0
## ENSMUSG00000051951.5                                    0
## ENSMUSG00000102851.1                                    0
## ENSMUSG00000103377.1                                    1
## ENSMUSG00000104017.1                                    0
##                      WT-rep2Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                    0
## ENSMUSG00000064842.1                                    0
## ENSMUSG00000051951.5                                    0
## ENSMUSG00000102851.1                                    0
## ENSMUSG00000103377.1                                    2
## ENSMUSG00000104017.1                                    0
##                      WT-rep3Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                    0
## ENSMUSG00000064842.1                                    0
## ENSMUSG00000051951.5                                    0
## ENSMUSG00000102851.1                                    0
## ENSMUSG00000103377.1                                    0
## ENSMUSG00000104017.1                                    2
##                      cKO-rep1Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                     0
## ENSMUSG00000064842.1                                     0
## ENSMUSG00000051951.5                                     0
## ENSMUSG00000102851.1                                     0
## ENSMUSG00000103377.1                                     0
## ENSMUSG00000104017.1                                     1
##                      cKO-rep2Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                     0
## ENSMUSG00000064842.1                                     0
## ENSMUSG00000051951.5                                     0
## ENSMUSG00000102851.1                                     0
## ENSMUSG00000103377.1                                     0
## ENSMUSG00000104017.1                                     0
##                      cKO-rep3Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                     1
## ENSMUSG00000064842.1                                     0
## ENSMUSG00000051951.5                                     0
## ENSMUSG00000102851.1                                     1
## ENSMUSG00000103377.1                                     0
## ENSMUSG00000104017.1                                     0
dim(star.counts)
## [1] 55487     6
summary(star.counts)
##  WT-rep1Aligned.sortedByCoord.out.bam WT-rep2Aligned.sortedByCoord.out.bam
##  Min.   :     0                       Min.   :     0                      
##  1st Qu.:     0                       1st Qu.:     0                      
##  Median :     1                       Median :     1                      
##  Mean   :  1009                       Mean   :   904                      
##  3rd Qu.:    86                       3rd Qu.:    78                      
##  Max.   :811870                       Max.   :470203                      
##  WT-rep3Aligned.sortedByCoord.out.bam cKO-rep1Aligned.sortedByCoord.out.bam
##  Min.   :     0.0                     Min.   :     0.0                     
##  1st Qu.:     0.0                     1st Qu.:     0.0                     
##  Median :     1.0                     Median :     1.0                     
##  Mean   :   874.5                     Mean   :   922.1                     
##  3rd Qu.:    77.0                     3rd Qu.:    82.0                     
##  Max.   :388953.0                     Max.   :459595.0                     
##  cKO-rep2Aligned.sortedByCoord.out.bam cKO-rep3Aligned.sortedByCoord.out.bam
##  Min.   :     0.0                      Min.   :     0.0                     
##  1st Qu.:     0.0                      1st Qu.:     0.0                     
##  Median :     1.0                      Median :     1.0                     
##  Mean   :   933.4                      Mean   :   795.3                     
##  3rd Qu.:    81.0                      3rd Qu.:    70.0                     
##  Max.   :412338.0                      Max.   :390466.0

Importing HISAT2 counts

# Extract counts by running featureCounts() 
hisat2.counts <- fcounts.fn(metadata$HISAT2_path)
## 
##         ==========     _____ _    _ ____  _____  ______          _____  
##         =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
##           =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
##             ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
##               ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
##         ==========   |_____/ \____/|____/|_|  \_\______/_/    \_\_____/
##        Rsubread 2.4.0
## 
## //========================== featureCounts setting ===========================\\
## ||                                                                            ||
## ||             Input files : 6 BAM files                                      ||
## ||                                                                            ||
## ||                           WT-rep1.sorted.bam                               ||
## ||                           WT-rep2.sorted.bam                               ||
## ||                           WT-rep3.sorted.bam                               ||
## ||                           cKO-rep1.sorted.bam                              ||
## ||                           cKO-rep2.sorted.bam                              ||
## ||                           cKO-rep3.sorted.bam                              ||
## ||                                                                            ||
## ||              Paired-end : no                                               ||
## ||        Count read pairs : no                                               ||
## ||              Annotation : gencode.m25.primary_assembly.annotation.gtf  ... ||
## ||      Dir for temp files : .                                                ||
## ||                 Threads : 16                                               ||
## ||                   Level : meta-feature level                               ||
## ||      Multimapping reads : counted                                          ||
## || Multi-overlapping reads : not counted                                      ||
## ||   Min overlapping bases : 1                                                ||
## ||                                                                            ||
## \\============================================================================//
## 
## //================================= Running ==================================\\
## ||                                                                            ||
## || Load annotation file gencode.m25.primary_assembly.annotation.gtf ...       ||
## ||    Features : 843712                                                       ||
## ||    Meta-features : 55487                                                   ||
## ||    Chromosomes/contigs : 45                                                ||
## ||                                                                            ||
## || Process BAM file WT-rep1.sorted.bam...                                     ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 63363619                                             ||
## ||    Successfully assigned alignments : 48324485 (76.3%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep2.sorted.bam...                                     ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 56580014                                             ||
## ||    Successfully assigned alignments : 43217333 (76.4%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep3.sorted.bam...                                     ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 54969075                                             ||
## ||    Successfully assigned alignments : 41849987 (76.1%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep1.sorted.bam...                                    ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 57670244                                             ||
## ||    Successfully assigned alignments : 44111169 (76.5%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep2.sorted.bam...                                    ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 58054098                                             ||
## ||    Successfully assigned alignments : 44514481 (76.7%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep3.sorted.bam...                                    ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 50205194                                             ||
## ||    Successfully assigned alignments : 38062143 (75.8%)                     ||
## ||    Running time : 0.05 minutes                                             ||
## ||                                                                            ||
## || Write the final count table.                                               ||
## || Write the read assignment summary.                                         ||
## ||                                                                            ||
## \\============================================================================//
# Explore the HISAT2 count data frame
head(hisat2.counts)
##                      WT-rep1.sorted.bam WT-rep2.sorted.bam WT-rep3.sorted.bam
## ENSMUSG00000102693.1                  0                  0                  0
## ENSMUSG00000064842.1                  0                  0                  0
## ENSMUSG00000051951.5                  0                  0                  0
## ENSMUSG00000102851.1                  0                  0                  0
## ENSMUSG00000103377.1                  1                  2                  0
## ENSMUSG00000104017.1                  0                  0                  2
##                      cKO-rep1.sorted.bam cKO-rep2.sorted.bam
## ENSMUSG00000102693.1                   0                   0
## ENSMUSG00000064842.1                   0                   0
## ENSMUSG00000051951.5                   0                   0
## ENSMUSG00000102851.1                   0                   0
## ENSMUSG00000103377.1                   0                   0
## ENSMUSG00000104017.1                   1                   0
##                      cKO-rep3.sorted.bam
## ENSMUSG00000102693.1                   1
## ENSMUSG00000064842.1                   0
## ENSMUSG00000051951.5                   0
## ENSMUSG00000102851.1                   1
## ENSMUSG00000103377.1                   0
## ENSMUSG00000104017.1                   0
dim(hisat2.counts)
## [1] 55487     6
summary(hisat2.counts)
##  WT-rep1.sorted.bam WT-rep2.sorted.bam WT-rep3.sorted.bam cKO-rep1.sorted.bam
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   
##  Median :     1.0   Median :     1.0   Median :     1.0   Median :     1.0   
##  Mean   :   870.9   Mean   :   778.9   Mean   :   754.2   Mean   :   795.0   
##  3rd Qu.:    85.0   3rd Qu.:    77.0   3rd Qu.:    76.0   3rd Qu.:    81.5   
##  Max.   :755391.0   Max.   :436946.0   Max.   :361243.0   Max.   :428302.0   
##  cKO-rep2.sorted.bam cKO-rep3.sorted.bam
##  Min.   :     0.0    Min.   :     0     
##  1st Qu.:     0.0    1st Qu.:     0     
##  Median :     1.0    Median :     1     
##  Mean   :   802.3    Mean   :   686     
##  3rd Qu.:    81.0    3rd Qu.:    69     
##  Max.   :383887.0    Max.   :363272

Data cleaning: sample and gene annotation

countList <- list(salmon.counts, 
                  star.counts,
                  hisat2.counts)

# Assign names of the count data frames in the count list
names(countList) <- Aligners

# Set a function cleaning the count data frame
clean.fn <- function(df) {

    # Convert to a data frame
    df <- as.data.frame(df)

    # Assign column names
    names(df) <- SampleNames

    # Bring row names to a column
    df <- df %>% rownames_to_column(var="GENEID")

    return(df)
}


# Set a function to drop GENEID version
clean.annotation.fn <- function(df) {

    # Re-annotate without version specification
    df <- separate(df, "GENEID", c("GENEID", "Version"))

    # Remove version column
    df <- df[, colnames(df) != "Version"]

    return(df)
}

# Move GENEID to a column
for (x in Aligners) {

    countList[[x]] <- clean.fn(countList[[x]])

}


# Remove version of GENEID and duplicated rows in STAR & HISAT2 count tables
for (x in Aligners) {

    countList[[x]] <- clean.annotation.fn(countList[[x]]) %>% 

        distinct()

}



# Explore the cleaned count data frames 
head(countList[[1]])
##               GENEID  WT-rep1  WT-rep2  WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000000001 6492.000 5822.000 5614.000  6040.00  6142.00 5437.000
## 2 ENSMUSG00000000003    0.000    0.000    0.000     0.00     0.00    0.000
## 3 ENSMUSG00000000028 3904.066 3567.989 3714.192  3410.46  3463.04 3142.454
## 4 ENSMUSG00000000031   33.000   45.000   58.000    12.00    28.00   70.000
## 5 ENSMUSG00000000037  119.001   90.001   82.000   165.00   129.00  100.000
## 6 ENSMUSG00000000049    0.000    0.000    0.000     1.00     2.00    0.000
head(countList[[2]])
##               GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000102693       0       0       0        0        0        1
## 2 ENSMUSG00000064842       0       0       0        0        0        0
## 3 ENSMUSG00000051951       0       0       0        0        0        0
## 4 ENSMUSG00000102851       0       0       0        0        0        1
## 5 ENSMUSG00000103377       1       2       0        0        0        0
## 6 ENSMUSG00000104017       0       0       2        1        0        0
head(countList[[3]])
##               GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000102693       0       0       0        0        0        1
## 2 ENSMUSG00000064842       0       0       0        0        0        0
## 3 ENSMUSG00000051951       0       0       0        0        0        0
## 4 ENSMUSG00000102851       0       0       0        0        0        1
## 5 ENSMUSG00000103377       1       2       0        0        0        0
## 6 ENSMUSG00000104017       0       0       2        1        0        0
dim(countList[[1]])
## [1] 51956     7
dim(countList[[2]])
## [1] 55487     7
dim(countList[[3]])
## [1] 55487     7
sum(duplicated(countList[[1]]))
## [1] 0
sum(duplicated(countList[[2]]))
## [1] 0
sum(duplicated(countList[[3]]))
## [1] 0
# Convert Salmon counts to integers 
countList[["Salmon"]] <- cbind(GENEID=countList[["Salmon"]][, "GENEID"],
                               round(countList[["Salmon"]][, 
                               colnames(countList[["Salmon"]]) %in% SampleNames]))

# Explore the cleaned count data frames 
head(countList[[1]])
##               GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000000001    6492    5822    5614     6040     6142     5437
## 2 ENSMUSG00000000003       0       0       0        0        0        0
## 3 ENSMUSG00000000028    3904    3568    3714     3410     3463     3142
## 4 ENSMUSG00000000031      33      45      58       12       28       70
## 5 ENSMUSG00000000037     119      90      82      165      129      100
## 6 ENSMUSG00000000049       0       0       0        1        2        0

Plotting sequencing depth

Number of total counts per sample

# Set a function generating a data frame with sequencing depth
seq.depth.fn <- function(df, aligner) {

    seqdf <- as.data.frame(colSums(df[, SampleNames])) %>% 
        rownames_to_column (var="Sample") %>% 
        mutate(Aligner=aligner)

    names(seqdf) <- c("Sample", "Count", "Aligner")

    return(seqdf)
}

# Set a function for a bar plot comparing values
comparing.barplot.fn <- function(df, yval, title, ytitle) {

    ggplot(df, 
       aes(x=Sample, y=yval, group=Aligner, fill=Aligner)) +
    geom_bar(stat="identity", position="dodge") +
    theme_bw() +
    ggtitle(title) + 
    ylab(ytitle)

}




# Initialize the seq depth data frame with the first aligner
seq.depth.df <- seq.depth.fn(countList[[1]], Aligners[1])

# Extend the seq depth data frame with the rest of aligners
for (x in Aligners) {

    if (x %in% Aligners[2:length(Aligners)]) {

        seq.depth.df <- rbind(seq.depth.df, 
                              seq.depth.fn(countList[[x]], x))
    }
}

# Explore how the data frame 
print(seq.depth.df)
##      Sample    Count Aligner
## 1   WT-rep1 37460304  Salmon
## 2   WT-rep2 33384135  Salmon
## 3   WT-rep3 32585959  Salmon
## 4  cKO-rep1 34197855  Salmon
## 5  cKO-rep2 34442159  Salmon
## 6  cKO-rep3 29538115  Salmon
## 7   WT-rep1 55979059    STAR
## 8   WT-rep2 50159175    STAR
## 9   WT-rep3 48522991    STAR
## 10 cKO-rep1 51163980    STAR
## 11 cKO-rep2 51790758    STAR
## 12 cKO-rep3 44128747    STAR
## 13  WT-rep1 48324485  HISAT2
## 14  WT-rep2 43217333  HISAT2
## 15  WT-rep3 41849987  HISAT2
## 16 cKO-rep1 44111169  HISAT2
## 17 cKO-rep2 44514481  HISAT2
## 18 cKO-rep3 38062143  HISAT2
summary(seq.depth.df)
##     Sample              Count            Aligner         
##  Length:18          Min.   :29538115   Length:18         
##  Class :character   1st Qu.:35196695   Class :character  
##  Mode  :character   Median :43664251   Mode  :character  
##                     Mean   :42412935                     
##                     3rd Qu.:48473364                     
##                     Max.   :55979059
# Convert character vectors to factors
seq.depth.df$Sample <- factor(seq.depth.df$Sample, 
                              levels=SampleNames)
seq.depth.df$Aligner <- factor(seq.depth.df$Aligner, 
                               levels=Aligners)

# Create a plot presenting sequencing depth
comparing.barplot.fn(seq.depth.df, 
                     seq.depth.df$Count, 
                     "Sequencing Depth by Sample and Aligner", 
                     "Count")

Generating DESeq2 objects

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Initialize new lists for storing dds objects
ddsList <- countList

# Initialize new lists for storing vsd objects
vsdList <- countList


for (x in Aligners) {

    # Create a count matrix from the count data frame 
    m <- countList[[x]][, colnames(countList[[x]]) != "GENEID"] %>% 
        as.matrix()

    # Assigne row names
    rownames(m) <- countList[[x]]$GENEID

    # Generate a DESeq2 object
    ddsList[[x]] <- DESeqDataSetFromMatrix(m, 
                                           colData=metadata, 
                                           design=~Group) 

    # Conduct vst
    vsdList[[x]] <- varianceStabilizingTransformation(ddsList[[x]], 
                                                      blind=TRUE) 
}

# Explore generated objects
summary(ddsList)
##        Length Class        Mode
## Salmon 51956  DESeqDataSet S4  
## STAR   55487  DESeqDataSet S4  
## HISAT2 55487  DESeqDataSet S4
summary(vsdList)
##        Length Class          Mode
## Salmon 51956  DESeqTransform S4  
## STAR   55487  DESeqTransform S4  
## HISAT2 55487  DESeqTransform S4

Estimating size factors

- black dashed line: size factor = 1

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Calculate and add size factors to the DEseq object
for (x in Aligners) {

    ddsList[[x]] <- estimateSizeFactors(ddsList[[x]])

}

# Set a function summarizing size factors by aligner to a data frame
sfactor.fn <- function(df, aligner) {

    sizefactor <- as.data.frame(round(sizeFactors(df), 3)) %>%
        rownames_to_column(var="Sample") %>%
        mutate(Aligner=aligner)

    names(sizefactor) <- c("Sample", "Size_Factor", "Aligner")

    return(sizefactor)

}

# Initialize a data frame with the first aligner 
size.factor.df <- sfactor.fn(ddsList[[1]], Aligners[1])


for (x in Aligners) {

    if (x != Aligners[1]) {

        size.factor.df <- rbind(size.factor.df, 
                                sfactor.fn(ddsList[[x]], x))
    }
}


# Explore the data frame
print(size.factor.df)
##      Sample Size_Factor Aligner
## 1   WT-rep1       1.110  Salmon
## 2   WT-rep2       0.998  Salmon
## 3   WT-rep3       0.980  Salmon
## 4  cKO-rep1       1.020  Salmon
## 5  cKO-rep2       1.034  Salmon
## 6  cKO-rep3       0.894  Salmon
## 7   WT-rep1       1.107    STAR
## 8   WT-rep2       1.000    STAR
## 9   WT-rep3       0.977    STAR
## 10 cKO-rep1       1.019    STAR
## 11 cKO-rep2       1.038    STAR
## 12 cKO-rep3       0.891    STAR
## 13  WT-rep1       1.107  HISAT2
## 14  WT-rep2       1.000  HISAT2
## 15  WT-rep3       0.976  HISAT2
## 16 cKO-rep1       1.019  HISAT2
## 17 cKO-rep2       1.038  HISAT2
## 18 cKO-rep3       0.891  HISAT2
# Convert character vectors to factors
size.factor.df$Sample <- factor(size.factor.df$Sample, 
                              levels=SampleNames)
size.factor.df$Aligner <- factor(size.factor.df$Aligner, 
                               levels=Aligners)

# Plot calculated size factors
comparing.barplot.fn(size.factor.df, 
                     size.factor.df$Size_Factor,  
                     "Size Factors by Aligner and Sample", 
                     "Size Factor") + geom_hline(yintercept=1, linetype="dashed", color="black", size=1)

Estimating dispersion and Wald test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

for (x in Aligners) {

    # Dispersion
    ddsList[[x]] <- estimateDispersions(ddsList[[x]])
    
    # Wald test
    ddsList[[x]] <- nbinomWaldTest(ddsList[[x]])

}


# Explore generated data in the dds object 
ddsList[[1]]
## class: DESeqDataSet 
## dim: 51956 6 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(6): Sample Group ... HISAT2_path sizeFactor

Sample QC: Principal Component Analysis (PCA)

Identifies source of variation and sample outliers

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1]


# Set a function for sample pca
qcpca.fn <- function(obj, title) {

    plotPCA(obj,
        intgroup=GroupOfInterest,
        returnData=FALSE) + theme_bw() + ggtitle(paste("PCA:", title)) 

}

# Print the plots
qcpca.fn(vsdList[[1]], Aligners[1]) 

qcpca.fn(vsdList[[2]], Aligners[2])

qcpca.fn(vsdList[[3]], Aligners[3]) 

Sample QC: Sample Correlation Heatmap

Identifies distance between samples & correlation in a group

# Heatmap annotation
HeatmapAnno <- metadata[, c("Sample", "Group")]

# Set a function generating a correlation heatmap
cheatmap.fn <- function(df, title) {

    # Extract a normalized count matrix
    vm <- assay(df)

    # Generate a correlation matrix
    cm <- cor(vm)

    # Generate a heatmap
    pheatmap(cm, 
             annotation=HeatmapAnno, 
             main=paste("Sample Correlation Heatmap:", title))
}


# Print the heatmaps
cheatmap.fn(vsdList[[1]], Aligners[1])

cheatmap.fn(vsdList[[2]], Aligners[2])

cheatmap.fn(vsdList[[3]], Aligners[3])

Running DE analysis

# Run DESeq 
for (x in Aligners) {

    ddsList[[x]] <- DESeq(ddsList[[x]])
    # Check result names 
    ResNames <- resultsNames(ddsList[[x]])
    print(ResNames)

}
## [1] "Intercept"       "Group_cKO_vs_WT"
## [1] "Intercept"       "Group_cKO_vs_WT"
## [1] "Intercept"       "Group_cKO_vs_WT"

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Set a function plotting dispersion
dplot.fn <- function(dds, title) {

    plotDispEsts(dds, 
                 main=paste("Dispersion over Counts:", title))
}

# Plot dispersion patterns
dplot.fn(ddsList[[1]], Aligners[1])

dplot.fn(ddsList[[2]], Aligners[2])

dplot.fn(ddsList[[3]], Aligners[3])

# Do they fit well with the DESeq2 estimation model?

Setting how to extract fold-change results

Change variables below

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold alpha
alpha=0.1

# Set the coefficients to compare 
Coef <- ResNames[-1]
print(Coef) 
## [1] "Group_cKO_vs_WT"
# Set a function to clean a result table 
lfctable.fn <- function(df) {
    df <- df %>% 
        rownames_to_column(var="GENEID") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   paste("<", alpha), 
                                   paste(">", alpha))) 
    return(df)
}

# Set a function extracting results
extract.lfc.fn <- function(dds) {

    res <- results(dds, contrast=Contrast, alpha=alpha)
    lfctable.fn(as.data.frame(res))

    return(lfctable.fn(as.data.frame(res)))


}

Extracting log2FoldChanges

You can change alpha depending on your interest of FDR level

Shrinkage is NOT applied in this analysis

# Initialize a list storing lfc data frames
lfcList <- countList

# Extract DE results
# The Contrast variable was defined in the previous chunk
# Extraction with no shrinkage
# alpha: FDR threshold
for (x in Aligners) {

    lfcList[[x]] <- extract.lfc.fn(ddsList[[x]]) %>% mutate(Alignment=x)

    print(head(lfcList[[x]]))

}
##               GENEID     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5892.4376208    -0.04367298 0.05134311 -0.8506103
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3514.8301315     0.09209382 0.06403838  1.4381034
## 4 ENSMUSG00000000031   41.8581466     0.19865308 0.59900127  0.3316405
## 5 ENSMUSG00000000037  113.2455289    -0.50352734 0.23268971 -2.1639433
## 6 ENSMUSG00000000049    0.4857333    -2.44912383 3.56194529 -0.6875804
##      pvalue      padj   FDR Alignment
## 1 0.3949859 0.7963511 > 0.1    Salmon
## 2        NA        NA > 0.1    Salmon
## 3 0.1504047 0.5392314 > 0.1    Salmon
## 4 0.7401607 0.9435540 > 0.1    Salmon
## 5 0.0304687 0.2269216 > 0.1    Salmon
## 6 0.4917170        NA > 0.1    Salmon
##               GENEID  baseMean log2FoldChange    lfcSE       stat    pvalue
## 1 ENSMUSG00000102693 0.1870490     -1.0278145 4.080473 -0.2518861 0.8011291
## 2 ENSMUSG00000064842 0.0000000             NA       NA         NA        NA
## 3 ENSMUSG00000051951 0.0000000             NA       NA         NA        NA
## 4 ENSMUSG00000102851 0.1870490     -1.0278145 4.080473 -0.2518861 0.8011291
## 5 ENSMUSG00000103377 0.4837499      2.3680696 3.152915  0.7510730 0.4526087
## 6 ENSMUSG00000104017 0.5047301      0.8868952 3.201764  0.2770020 0.7817786
##   padj   FDR Alignment
## 1   NA > 0.1      STAR
## 2   NA > 0.1      STAR
## 3   NA > 0.1      STAR
## 4   NA > 0.1      STAR
## 5   NA > 0.1      STAR
## 6   NA > 0.1      STAR
##               GENEID  baseMean log2FoldChange    lfcSE       stat    pvalue
## 1 ENSMUSG00000102693 0.1870355     -1.0280029 4.080473 -0.2519323 0.8010934
## 2 ENSMUSG00000064842 0.0000000             NA       NA         NA        NA
## 3 ENSMUSG00000051951 0.0000000             NA       NA         NA        NA
## 4 ENSMUSG00000102851 0.1870355     -1.0280029 4.080473 -0.2519323 0.8010934
## 5 ENSMUSG00000103377 0.4838032      2.3677771 3.123686  0.7580073 0.4484466
## 6 ENSMUSG00000104017 0.5050522      0.8868523 3.167217  0.2800099 0.7794699
##   padj   FDR Alignment
## 1   NA > 0.1    HISAT2
## 2   NA > 0.1    HISAT2
## 3   NA > 0.1    HISAT2
## 4   NA > 0.1    HISAT2
## 5   NA > 0.1    HISAT2
## 6   NA > 0.1    HISAT2
# Initialize a data frame storing total lfc results across the aligners
lfc.dataframe <- lfcList[[1]] 

for (x in Aligners[2:length(Aligners)]) {

    lfc.dataframe <- rbind(lfc.dataframe, 
                           lfcList[[x]])

}


lfc.dataframe$Alignment <- factor(lfc.dataframe$Alignment, 
                                  levels=Aligners)

Exploring distribution of false discovery rate (FDR)

Black dashed line: FDR = 0.1

# Plot distribution of FDR 
ggplot(lfc.dataframe, 
       aes(x=padj, y=..count.., color=Alignment)) + 
    geom_density(size=1) + 
    theme_bw() + 
    scale_x_log10() + 
    ggtitle("Distribution of False Discovery Rate (FDR) by Aligner") + 
    ylab("Count") +
    xlim(0.00001, 1) + 
    geom_vline(xintercept=alpha, 
               color="black", 
               size=1, linetype="dashed") + scale_x_continuous(breaks=seq(0, 1, by=0.1))

Presenting distribution of log2FoldChange

Black: total genes (padj =/= NA)

Colored: genes above or below FDR=0.1

valid.lfc.df <- subset(lfc.dataframe, FDR == "< 0.1")

ggplot(valid.lfc.df,
       aes(x=log2FoldChange,
           y=..count.., 
           color=Alignment)) +
geom_density(size=1) + 
theme_bw() + 
geom_vline(xintercept=c(-1, 1), 
           linetype="dashed", color="black", size=1) + 
ggtitle("Distribution of log2FoldChange Values by Aligner (FDR < 0.1)") +
ylab("Count") + 
xlim(-10, 10)    # Change xlim by datatype

Exploring mean-difference with an MA plot

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

# Set ylim: has to adjusted by users depending on data 
yl <- c(-10, 10)

# Set min log2 fold change of interest 
mLog <- c(-1, 1)

# Create MA plots by Aligner
ggplot(lfc.dataframe, aes(x=baseMean, y=log2FoldChange, color=FDR)) + 
        geom_point() + 
        facet_grid(~Alignment) +
        scale_x_log10() + 
        theme_bw() + 
        scale_color_manual(values=c("blue", "grey")) + 
        ggtitle(paste("MA plot")) + 
        ylim(yl[1], yl[2]) + 
        theme(strip.text.x=element_text(size=10)) +
        geom_hline(yintercept=c(mLog[1], mLog[2]), 
                   linetype="dashed", color="red") 

Exploring expression profiling with normalized count data

- Normalized count matrices are extracted from dds objects and filtered with thresholds set at FDR and log2FoldChange

- The heatmaps display z-scores of the normalized counts

- In this analysis, mLog = 1

- References: Harvard Chan Bioinformatics Core workshop, DESeq2 doc “Heatmap of the count matrix”

# Initialize a list 
heatmap.df.List <- lfcList

# Filter genes with FDR < alpha and absolute log2FoldChange > 1
for (x in Aligners) {

    # Set a logical vector filtering FDR below alpha
    is.fdr.valid <- lfcList[[x]]$FDR == paste("<", alpha)

    # Set a logical vector filtering absolute lfc above 1 
    is.lfc.large <- abs(lfcList[[x]]$log2FoldChange) > mLog[2]

    # Extract total normalized counts
    norm.counts <- counts(ddsList[[x]], normalized=T)

    # Save filtered genes only from the normalized count data
    heatmap.df.List[[x]] <- norm.counts[is.fdr.valid & is.lfc.large,]

}

# Explore the cleaned data frames 
head(heatmap.df.List[[1]])
##                     WT-rep1  WT-rep2  WT-rep3   cKO-rep1   cKO-rep2   cKO-rep3
## ENSMUSG00000000125 17.11371 17.03764 12.24046  19.617050  81.208977  48.112902
## ENSMUSG00000000303 23.41876 21.04649 40.80153   6.865968   4.833868  10.070142
## ENSMUSG00000000673 28.82309 29.06421 31.62119  11.770230  15.468377  11.189047
## ENSMUSG00000000686 85.56853 80.17712 84.66318  39.234100  35.770621  32.448236
## ENSMUSG00000000869 60.34833 64.14170 15.30058  22.559608   8.700962   6.713428
## ENSMUSG00000000957 24.31948 24.05314 15.30058 100.046956 164.351502 111.890470
head(heatmap.df.List[[2]])
##                       WT-rep1    WT-rep2    WT-rep3   cKO-rep1   cKO-rep2
## ENSMUSG00000025929  151.77009   57.97408   16.37872   7.848327   7.705361
## ENSMUSG00000041872  131.89543   62.97184   16.37872  14.715613  14.447553
## ENSMUSG00000047180 1469.82104 1545.30903 1586.68816 626.885122 470.027043
## ENSMUSG00000026072   49.68664   46.97899   57.32551  10.791450   5.779021
## ENSMUSG00000026070  542.93943  570.74480  901.85308 417.923415 242.718883
## ENSMUSG00000025969   33.42556   36.98346   30.71009  60.824535  78.016784
##                      cKO-rep3
## ENSMUSG00000025929   5.611470
## ENSMUSG00000041872   8.978352
## ENSMUSG00000047180 622.873185
## ENSMUSG00000026072  14.589822
## ENSMUSG00000026070 253.638450
## ENSMUSG00000025969  65.093054
head(heatmap.df.List[[3]])
##                       WT-rep1    WT-rep2    WT-rep3   cKO-rep1   cKO-rep2
## ENSMUSG00000025929  150.82592   57.99044   16.38913   7.853379   7.708966
## ENSMUSG00000041872  127.34404   58.99027   12.29185  13.743413  15.417932
## ENSMUSG00000047180 1458.58602 1537.74640 1576.42924 623.361945 468.319699
## ENSMUSG00000026072   49.67321   45.99242   57.36195  10.798396   5.781725
## ENSMUSG00000026070  525.63286  555.90832  872.72106 409.357372 235.123470
## ENSMUSG00000025969   34.31967   37.99373   32.77826  57.918669  82.871387
##                      cKO-rep3
## ENSMUSG00000025929   5.611066
## ENSMUSG00000041872   8.977705
## ENSMUSG00000047180 617.217229
## ENSMUSG00000026072  14.588771
## ENSMUSG00000026070 246.886892
## ENSMUSG00000025969  70.699428
dim(heatmap.df.List[[1]])
## [1] 237   6
dim(heatmap.df.List[[2]])
## [1] 236   6
dim(heatmap.df.List[[3]])
## [1] 240   6
pheatmap(heatmap.df.List[[3]], 
         annotation=HeatmapAnno,
         scale="row",
         show_rownames=F)

# Set a function creating a profiling heatmap
profile.heatmap.fn <- function(df, title) {

    pheatmap(df, 
             annotation=HeatmapAnno, 
             scale="row", 
             show_rownames=F,
             main=paste("Expression Profiling by", title, "(FDR < 0.1, absolute log2FoldChange > 1)"))
}

# Print the expression heatmaps
profile.heatmap.fn(heatmap.df.List[[Aligners[1]]], Aligners[1])

profile.heatmap.fn(heatmap.df.List[[Aligners[2]]], Aligners[2])

profile.heatmap.fn(heatmap.df.List[[Aligners[3]]], Aligners[3])

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 

# Create a data frame storing number of NA genes by type
NA.genes <- lfc.dataframe %>% 
    group_by(Alignment) %>% 
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>% 
    mutate(total=zero + outlier) %>%
    gather(Type, Number, -Alignment) %>% 
    mutate(Type=factor(case_when(Type == "zero" ~ type[1],
                                 Type == "outlier" ~ type[2],
                                 Type == "total" ~ type[3]),
                       levels=type))

# Plot number of NA genes 
ggplot(NA.genes, 
       aes(x=Type, y=Number, group=Alignment, fill=Alignment, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

baseMean/LFC/FDR comparison between aligners

# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>% 
    group_by(GENEID) %>% 
    summarize(num.trans=n_distinct(TXID))

# Create an empty list storing significant gene lfc tables 
sigList <- list() 


# Filter significant genes' lfc and save in the list 
for (x in Aligners) {

    sigList[[x]] <- subset(lfcList[[x]], FDR == paste("<", alpha))

}

# Create a vector storing column names
c_names <- colnames(sigList[[1]])[-1] 
c_names <- c("GENEID", 
             paste0(c_names, "_", Aligners[1]), 
             paste0(c_names, "_", Aligners[2]), 
             paste0(c_names, "_", Aligners[3]))


# Join tables by GENEID
lfcTable <- sigList[[1]] %>% 
    inner_join(sigList[[2]], by="GENEID") %>%
    inner_join(sigList[[3]], by="GENEID") 

# Rename the columns
names(lfcTable) <- c_names





# Calculate differences
lfcTable <- lfcTable %>%

    mutate(mean_SA_ST=baseMean_Salmon - baseMean_STAR,
           mean_SA_HI=baseMean_Salmon - baseMean_HISAT2,
           mean_ST_HI=baseMean_STAR - baseMean_HISAT2, 
           lfc_SA_ST=log2FoldChange_Salmon - log2FoldChange_STAR,
           lfc_SA_HI=log2FoldChange_Salmon - log2FoldChange_HISAT2,
           lfc_ST_HI=log2FoldChange_STAR - log2FoldChange_HISAT2,
           FDR_SA_ST=padj_Salmon - padj_STAR,
           FDR_SA_HI=padj_Salmon - padj_HISAT2, 
           FDR_ST_HI=padj_STAR - padj_HISAT2) %>% 

left_join(AnnoDb.ntrans, by="GENEID")


# Set a function to create a vector storing plot labels
plotlabels.fn <- function(myvec, mylist, num) {

    vec <- c()

    for (i in 1:num) {
        vec <- c(vec, c(myvec[i], rep("", nrow(mylist[[i]]) - 1)))
    }

    return(vec)

}



# Explore the output table
head(lfcTable)
##               GENEID baseMean_Salmon log2FoldChange_Salmon lfcSE_Salmon
## 1 ENSMUSG00000000078      3182.57630            -0.2540106   0.07029216
## 2 ENSMUSG00000000085       574.54499            -0.3051097   0.11172615
## 3 ENSMUSG00000000125        32.55512            -1.6815257   0.54687724
## 4 ENSMUSG00000000142       567.90952             0.4233999   0.15963537
## 5 ENSMUSG00000000184     33671.54570            -0.2175147   0.08092099
## 6 ENSMUSG00000000275      4399.69082            -0.2347587   0.08326712
##   stat_Salmon pvalue_Salmon padj_Salmon FDR_Salmon Alignment_Salmon
## 1   -3.613640  0.0003019283 0.009025866      < 0.1           Salmon
## 2   -2.730871  0.0063167266 0.078616294      < 0.1           Salmon
## 3   -3.074777  0.0021065986 0.035676802      < 0.1           Salmon
## 4    2.652294  0.0079946945 0.092354125      < 0.1           Salmon
## 5   -2.687988  0.0071883891 0.086318131      < 0.1           Salmon
## 6   -2.819344  0.0048121884 0.064485900      < 0.1           Salmon
##   baseMean_STAR log2FoldChange_STAR lfcSE_STAR stat_STAR  pvalue_STAR
## 1     3297.0264          -0.2457384 0.06789377 -3.619455 0.0002952245
## 2      609.0892          -0.2974915 0.10151662 -2.930471 0.0033844842
## 3       34.2459          -1.5923501 0.50259104 -3.168282 0.0015334277
## 4      554.3048           0.4313936 0.15684032  2.750527 0.0059499441
## 5    32580.6962          -0.2126708 0.07890771 -2.695184 0.0070349764
## 6     4550.3626          -0.2311471 0.08246804 -2.802869 0.0050650243
##     padj_STAR FDR_STAR Alignment_STAR baseMean_HISAT2 log2FoldChange_HISAT2
## 1 0.008914826    < 0.1           STAR       3245.3292            -0.2448183
## 2 0.054157532    < 0.1           STAR        586.1192            -0.2902909
## 3 0.029873916    < 0.1           STAR         33.3595            -1.5804287
## 4 0.081968251    < 0.1           STAR        546.0067             0.4288078
## 5 0.092687424    < 0.1           STAR      32115.4478            -0.2137057
## 6 0.072494399    < 0.1           STAR       4473.6172            -0.2329771
##   lfcSE_HISAT2 stat_HISAT2 pvalue_HISAT2 padj_HISAT2 FDR_HISAT2
## 1   0.06846413   -3.575863  0.0003490748  0.01051970      < 0.1
## 2   0.10679988   -2.718082  0.0065661506  0.09105713      < 0.1
## 3   0.50731877   -3.115258  0.0018378428  0.03526692      < 0.1
## 4   0.15500097    2.766485  0.0056664239  0.08250596      < 0.1
## 5   0.07922648   -2.697402  0.0069882806  0.09507324      < 0.1
## 6   0.08238260   -2.827989  0.0046841398  0.07161359      < 0.1
##   Alignment_HISAT2  mean_SA_ST   mean_SA_HI  mean_ST_HI    lfc_SA_ST
## 1           HISAT2 -114.450102  -62.7529172  51.6971844 -0.008272156
## 2           HISAT2  -34.544247  -11.5742133  22.9700335 -0.007618142
## 3           HISAT2   -1.690775   -0.8043754   0.8863992 -0.089175577
## 4           HISAT2   13.604757   21.9027794   8.2980223 -0.007993669
## 5           HISAT2 1090.849492 1556.0979011 465.2484087 -0.004843870
## 6           HISAT2 -150.671758  -73.9264127  76.7453451 -0.003611537
##      lfc_SA_HI     lfc_ST_HI     FDR_SA_ST     FDR_SA_HI     FDR_ST_HI
## 1 -0.009192246 -0.0009200906  0.0001110396 -0.0014938334 -0.0016048730
## 2 -0.014818795 -0.0072006531  0.0244587618 -0.0124408368 -0.0368995986
## 3 -0.101096974 -0.0119213968  0.0058028863  0.0004098853 -0.0053930010
## 4 -0.005407920  0.0025857491  0.0103858733  0.0098481622 -0.0005377112
## 5 -0.003809007  0.0010348636 -0.0063692929 -0.0087551080 -0.0023858151
## 6 -0.001781560  0.0018299765 -0.0080084993 -0.0071276865  0.0008808128
##   num.trans
## 1         3
## 2        14
## 3         1
## 4         7
## 5         8
## 6         3
dim(lfcTable)
## [1] 1199   35
my.param <- c("baseMean", "log2FoldChange", "padj")


# Slice and clean the data frame for input
lfcTable.comp <- lfcTable %>% 
    dplyr::select(GENEID, num.trans, starts_with(my.param)) %>% 
    gather(Category, Value, -GENEID, -num.trans) %>% 
    separate(Category, c("Metric", "Aligner"), sep="_") %>% pivot_wider(names_from=Aligner, values_from=Value) %>% 
    group_by(Metric) %>%
    nest() %>% 

    # Calculate correlation coefficients between aligners by metric
    mutate(salmon.star.corr=map_dbl(data, ~ cor(.x$Salmon, .x$STAR)), 
           salmon.hisat.corr=map_dbl(data, ~ cor(.x$Salmon, .x$HISAT2)),
           star.hisat.corr=map_dbl(data, ~ cor(.x$STAR, .x$HISAT2))) 

# Explore the cleaned data frame
lfcTable.comp
## # A tibble: 3 x 5
## # Groups:   Metric [3]
##   Metric       data            salmon.star.corr salmon.hisat.co… star.hisat.corr
##   <chr>        <list>                     <dbl>            <dbl>           <dbl>
## 1 baseMean     <tibble [1,199…            0.980            0.986           0.998
## 2 log2FoldCha… <tibble [1,199…            0.996            0.996           0.999
## 3 padj         <tibble [1,199…            0.855            0.854           0.956
lfcTable.comp$data
## [[1]]
## # A tibble: 1,199 x 5
##    GENEID             num.trans  Salmon    STAR  HISAT2
##    <chr>                  <int>   <dbl>   <dbl>   <dbl>
##  1 ENSMUSG00000000078         3  3183.   3297.   3245. 
##  2 ENSMUSG00000000085        14   575.    609.    586. 
##  3 ENSMUSG00000000125         1    32.6    34.2    33.4
##  4 ENSMUSG00000000142         7   568.    554.    546. 
##  5 ENSMUSG00000000184         8 33672.  32581.  32115. 
##  6 ENSMUSG00000000275         3  4400.   4550.   4474. 
##  7 ENSMUSG00000000278         1  2052.   1976.   1942. 
##  8 ENSMUSG00000000303         3    17.8    18.5    17.9
##  9 ENSMUSG00000000308        10   177.    187.    180. 
## 10 ENSMUSG00000000409         8 16510.  17117.  16661. 
## # … with 1,189 more rows
## 
## [[2]]
## # A tibble: 1,199 x 5
##    GENEID             num.trans Salmon   STAR HISAT2
##    <chr>                  <int>  <dbl>  <dbl>  <dbl>
##  1 ENSMUSG00000000078         3 -0.254 -0.246 -0.245
##  2 ENSMUSG00000000085        14 -0.305 -0.297 -0.290
##  3 ENSMUSG00000000125         1 -1.68  -1.59  -1.58 
##  4 ENSMUSG00000000142         7  0.423  0.431  0.429
##  5 ENSMUSG00000000184         8 -0.218 -0.213 -0.214
##  6 ENSMUSG00000000275         3 -0.235 -0.231 -0.233
##  7 ENSMUSG00000000278         1 -0.383 -0.396 -0.390
##  8 ENSMUSG00000000303         3  1.98   1.97   1.90 
##  9 ENSMUSG00000000308        10 -0.903 -0.866 -0.901
## 10 ENSMUSG00000000409         8  0.214  0.219  0.218
## # … with 1,189 more rows
## 
## [[3]]
## # A tibble: 1,199 x 5
##    GENEID             num.trans   Salmon     STAR   HISAT2
##    <chr>                  <int>    <dbl>    <dbl>    <dbl>
##  1 ENSMUSG00000000078         3 0.00903  0.00891  0.0105  
##  2 ENSMUSG00000000085        14 0.0786   0.0542   0.0911  
##  3 ENSMUSG00000000125         1 0.0357   0.0299   0.0353  
##  4 ENSMUSG00000000142         7 0.0924   0.0820   0.0825  
##  5 ENSMUSG00000000184         8 0.0863   0.0927   0.0951  
##  6 ENSMUSG00000000275         3 0.0645   0.0725   0.0716  
##  7 ENSMUSG00000000278         1 0.000491 0.000205 0.000292
##  8 ENSMUSG00000000303         3 0.0173   0.00697  0.0112  
##  9 ENSMUSG00000000308        10 0.00127  0.000285 0.000396
## 10 ENSMUSG00000000409         8 0.00361  0.00331  0.00509 
## # … with 1,189 more rows
# Prep vectors storing the correlation coefficient without duplication by comparison
Rsquared.salmon.star <- plotlabels.fn(lfcTable.comp$salmon.star.corr, 
                          lfcTable.comp$data, 
                          nrow(lfcTable.comp))

Rsquared.salmon.hisat <- plotlabels.fn(lfcTable.comp$salmon.hisat.corr, 
                            lfcTable.comp$data,
                            nrow(lfcTable.comp))

Rsquared.star.hisat <- plotlabels.fn(lfcTable.comp$star.hisat.corr,
                            lfcTable.comp$data, 
                            nrow(lfcTable.comp))

# Unnest the data frame and add columns storing correlation coefficients each
lfcTable.comp <- lfcTable.comp %>% 
    unnest(data) 

head(lfcTable.comp)
## # A tibble: 6 x 9
## # Groups:   Metric [1]
##   Metric GENEID num.trans Salmon   STAR HISAT2 salmon.star.corr salmon.hisat.co…
##   <chr>  <chr>      <int>  <dbl>  <dbl>  <dbl>            <dbl>            <dbl>
## 1 baseM… ENSMU…         3 3.18e3 3.30e3 3.25e3            0.980            0.986
## 2 baseM… ENSMU…        14 5.75e2 6.09e2 5.86e2            0.980            0.986
## 3 baseM… ENSMU…         1 3.26e1 3.42e1 3.34e1            0.980            0.986
## 4 baseM… ENSMU…         7 5.68e2 5.54e2 5.46e2            0.980            0.986
## 5 baseM… ENSMU…         8 3.37e4 3.26e4 3.21e4            0.980            0.986
## 6 baseM… ENSMU…         3 4.40e3 4.55e3 4.47e3            0.980            0.986
## # … with 1 more variable: star.hisat.corr <dbl>
lfcTable.comp$Rsquared.salmon.star <- Rsquared.salmon.star
lfcTable.comp$Rsquared.salmon.hisat <- Rsquared.salmon.hisat
lfcTable.comp$Rsquared.star.hisat <- Rsquared.star.hisat

# Explore the data frame
head(lfcTable.comp)
## # A tibble: 6 x 12
## # Groups:   Metric [1]
##   Metric GENEID num.trans Salmon   STAR HISAT2 salmon.star.corr salmon.hisat.co…
##   <chr>  <chr>      <int>  <dbl>  <dbl>  <dbl>            <dbl>            <dbl>
## 1 baseM… ENSMU…         3 3.18e3 3.30e3 3.25e3            0.980            0.986
## 2 baseM… ENSMU…        14 5.75e2 6.09e2 5.86e2            0.980            0.986
## 3 baseM… ENSMU…         1 3.26e1 3.42e1 3.34e1            0.980            0.986
## 4 baseM… ENSMU…         7 5.68e2 5.54e2 5.46e2            0.980            0.986
## 5 baseM… ENSMU…         8 3.37e4 3.26e4 3.21e4            0.980            0.986
## 6 baseM… ENSMU…         3 4.40e3 4.55e3 4.47e3            0.980            0.986
## # … with 4 more variables: star.hisat.corr <dbl>, Rsquared.salmon.star <chr>,
## #   Rsquared.salmon.hisat <chr>, Rsquared.star.hisat <chr>
# Nest the data frame by Metric 
lfcTable.comp <- lfcTable.comp %>% 
    group_by(Metric) %>%
    nest()

# Explore the data frame
lfcTable.comp 
## # A tibble: 3 x 2
## # Groups:   Metric [3]
##   Metric         data                 
##   <chr>          <list>               
## 1 baseMean       <tibble [1,199 × 11]>
## 2 log2FoldChange <tibble [1,199 × 11]>
## 3 padj           <tibble [1,199 × 11]>
lfcTable.comp$data
## [[1]]
## # A tibble: 1,199 x 11
##    GENEID      num.trans  Salmon   STAR HISAT2 salmon.star.corr salmon.hisat.co…
##    <chr>           <int>   <dbl>  <dbl>  <dbl>            <dbl>            <dbl>
##  1 ENSMUSG000…         3  3183.  3.30e3 3.25e3            0.980            0.986
##  2 ENSMUSG000…        14   575.  6.09e2 5.86e2            0.980            0.986
##  3 ENSMUSG000…         1    32.6 3.42e1 3.34e1            0.980            0.986
##  4 ENSMUSG000…         7   568.  5.54e2 5.46e2            0.980            0.986
##  5 ENSMUSG000…         8 33672.  3.26e4 3.21e4            0.980            0.986
##  6 ENSMUSG000…         3  4400.  4.55e3 4.47e3            0.980            0.986
##  7 ENSMUSG000…         1  2052.  1.98e3 1.94e3            0.980            0.986
##  8 ENSMUSG000…         3    17.8 1.85e1 1.79e1            0.980            0.986
##  9 ENSMUSG000…        10   177.  1.87e2 1.80e2            0.980            0.986
## 10 ENSMUSG000…         8 16510.  1.71e4 1.67e4            0.980            0.986
## # … with 1,189 more rows, and 4 more variables: star.hisat.corr <dbl>,
## #   Rsquared.salmon.star <chr>, Rsquared.salmon.hisat <chr>,
## #   Rsquared.star.hisat <chr>
## 
## [[2]]
## # A tibble: 1,199 x 11
##    GENEID       num.trans Salmon   STAR HISAT2 salmon.star.corr salmon.hisat.co…
##    <chr>            <int>  <dbl>  <dbl>  <dbl>            <dbl>            <dbl>
##  1 ENSMUSG0000…         3 -0.254 -0.246 -0.245            0.996            0.996
##  2 ENSMUSG0000…        14 -0.305 -0.297 -0.290            0.996            0.996
##  3 ENSMUSG0000…         1 -1.68  -1.59  -1.58             0.996            0.996
##  4 ENSMUSG0000…         7  0.423  0.431  0.429            0.996            0.996
##  5 ENSMUSG0000…         8 -0.218 -0.213 -0.214            0.996            0.996
##  6 ENSMUSG0000…         3 -0.235 -0.231 -0.233            0.996            0.996
##  7 ENSMUSG0000…         1 -0.383 -0.396 -0.390            0.996            0.996
##  8 ENSMUSG0000…         3  1.98   1.97   1.90             0.996            0.996
##  9 ENSMUSG0000…        10 -0.903 -0.866 -0.901            0.996            0.996
## 10 ENSMUSG0000…         8  0.214  0.219  0.218            0.996            0.996
## # … with 1,189 more rows, and 4 more variables: star.hisat.corr <dbl>,
## #   Rsquared.salmon.star <chr>, Rsquared.salmon.hisat <chr>,
## #   Rsquared.star.hisat <chr>
## 
## [[3]]
## # A tibble: 1,199 x 11
##    GENEID    num.trans  Salmon    STAR  HISAT2 salmon.star.corr salmon.hisat.co…
##    <chr>         <int>   <dbl>   <dbl>   <dbl>            <dbl>            <dbl>
##  1 ENSMUSG0…         3 9.03e-3 8.91e-3 1.05e-2            0.855            0.854
##  2 ENSMUSG0…        14 7.86e-2 5.42e-2 9.11e-2            0.855            0.854
##  3 ENSMUSG0…         1 3.57e-2 2.99e-2 3.53e-2            0.855            0.854
##  4 ENSMUSG0…         7 9.24e-2 8.20e-2 8.25e-2            0.855            0.854
##  5 ENSMUSG0…         8 8.63e-2 9.27e-2 9.51e-2            0.855            0.854
##  6 ENSMUSG0…         3 6.45e-2 7.25e-2 7.16e-2            0.855            0.854
##  7 ENSMUSG0…         1 4.91e-4 2.05e-4 2.92e-4            0.855            0.854
##  8 ENSMUSG0…         3 1.73e-2 6.97e-3 1.12e-2            0.855            0.854
##  9 ENSMUSG0…        10 1.27e-3 2.85e-4 3.96e-4            0.855            0.854
## 10 ENSMUSG0…         8 3.61e-3 3.31e-3 5.09e-3            0.855            0.854
## # … with 1,189 more rows, and 4 more variables: star.hisat.corr <dbl>,
## #   Rsquared.salmon.star <chr>, Rsquared.salmon.hisat <chr>,
## #   Rsquared.star.hisat <chr>
# Set a function creating a scatter plot
comp.scatter.fn <- function(df, 
                            xvar, 
                            yvar, 
                            label, 
                            metric,
                            xlab, ylab,
                            xlog=F,
                            ylog=F) {

    p <- ggplot(df, aes(x=xvar, y=yvar, color=log(num.trans), label=label)) + 
        geom_point(alpha=0.5) + 
        theme_bw() + 
        geom_text(size=5, 
                  mapping=aes(x=Inf, y=Inf), 
                  vjust=2, hjust=1.2, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black") + 
ggtitle(paste("Comparison in", metric, "\n(with R-squared)")) + scale_color_gradient(low="blue", high="red") +
xlab(xlab) + 
ylab(ylab)

if (xlog) {

    p <- p + 
        scale_x_log10() 
}

if (ylog) { 

    p <- p + scale_y_log10()

}

return(p)
}
# Create and add the scatter plots with or without logarithmic transformation of 
# x/y scales 
lfcTable.comp <- lfcTable.comp %>%

    # Add scatter plots to the data frame:
    #
    # without log transformation of x and y scales
    mutate(salmon.star=map(data, 
                           ~ comp.scatter.fn(.x, .x$Salmon, .x$STAR, 
                                             .x$Rsquared.salmon.star, Metric,
                                             "Salmon", "STAR")),

           salmon.hisat=map(data, ~ comp.scatter.fn(.x,
                                                    .x$Salmon,
                                                    .x$HISAT2, 
                                                    .x$Rsquared.salmon.hisat,
                                                    Metric, 
                                                    "Salmon", "HISAT2")),
           star.hisat=map(data, ~ comp.scatter.fn(.x, 
                                                  .x$STAR, 
                                                  .x$HISAT2, 
                                                  .x$Rsquared.star.hisat,
                                                  Metric,
                                                  "STAR", "HISAT2")),

           # with log transformation of x and y scales
           salmon.star.log=map(data, 
                           ~ comp.scatter.fn(.x, .x$Salmon, .x$STAR, 
                                             .x$Rsquared.salmon.star, Metric, 
                                             "Salmon", "STAR", 
                                             T, T)),

           salmon.hisat.log=map(data, ~ comp.scatter.fn(.x,
                                                    .x$Salmon,
                                                    .x$HISAT2, 
                                                    .x$Rsquared.salmon.hisat, 
                                                    Metric, 
                                                    "Salmon", "HISAT2",
                                                    T, T)),

           star.hisat.log=map(data, ~ comp.scatter.fn(.x, 
                                                  .x$STAR, 
                                                  .x$HISAT2, 
                                                  .x$Rsquared.star.hisat,
                                                  Metric, 
                                                  "STAR", "HISAT2",
                                                  T, T)))


# Set a function simplifying grid.arrange() function
gr.ar.fn0 <- function(pl1, pl2, pl3) {

    grid.arrange(pl1, pl2, pl3, ncol=1) 

}



# Print the plots
for (i in 1:length(my.param)) {

    p1 <- gr.ar.fn0(lfcTable.comp$salmon.star[[i]], 
                lfcTable.comp$salmon.hisat[[i]], 
                lfcTable.comp$star.hisat[[i]]) 

    p2 <- gr.ar.fn0(lfcTable.comp$salmon.star.log[[i]],
                lfcTable.comp$salmon.hisat.log[[i]],
                lfcTable.comp$star.hisat.log[[i]])

    print(grid.arrange(p1, p2, nrow=1)) 

}

## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name            grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (1-1,2-2) arrange gtable[arrange]

## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name            grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (1-1,2-2) arrange gtable[arrange]

## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name            grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (1-1,2-2) arrange gtable[arrange]

Relationship between baseMean/log2FoldChange/padj with the number of alternative transcripts

- Variables of interest: baseMean, log2FoldChange, and padj

- R-squared: tests whether the two variables are correlated

- None of the variable were correlated with the number of alternative transcripts in any of the aligners

- ANOVA: tests whether the the variable is significantly different by Aligner

# Slice the data frame 
lfcTable.rel <- lfcTable %>%
    dplyr::select(starts_with(c(my.param, "num.")))

# Set a function cleaning my data frame
gather.fn <- function(word) {

    gather(lfcTable.rel, Group, Value, starts_with(word)) %>%
        dplyr::select(Group, Value, num.trans)
}

# Set a function creating a boxplot
boxplot1.fn <- function(metric, ylog=F) {

    df <- subset(lfcTable.rela, Metric == metric)

p <- ggplot(df, 
            aes(x= Aligner, y=Value, fill=Aligner, label=ANOVA.pval)) + 
           geom_boxplot(alpha=0.5) + 
           theme_bw() + 
           xlab("Aligner") + 
           ylab("Value") + 
           theme(strip.text.x=element_text(size=10)) +
           ggtitle(paste("Aligner Comparison in", 
                         metric,
                         "\n(P-value from Two-Tailed ANOVA Test)")) + ylab(metric) + 
           geom_text(size=5, mapping=aes(x=Inf, y=Inf), vjust=2, hjust=1.2)

if (ylog) {

    p <- p + scale_y_log10()

}

return(p) 
}




# Clean the data frame 
lfcTable.rela <- rbind(gather.fn("base"), 
                       gather.fn("log2"), 
                       gather.fn("padj")) %>%

separate(Group, c("Metric", "Aligner"), sep="_") %>%

# Nest by Metric
group_by(Metric) %>%
nest() %>%

# Add columns storing: 
# 1. linear regression model 
# 2. summary of the model 
# 3. R-squared extracted from the model 
# 4. ANOVA test 
# 5. p-values extracted from the ANOVA objects
mutate(Model=map(data, ~ lm(Value ~ num.trans, data=.x)), 
       Model_Summary=map(Model, ~ summary(.x)),
       Rsquared=map_dbl(Model_Summary, ~ .x$r.squared), 
       ANOVA=map(data, ~ unlist(summary(aov(Value ~ Aligner, data=.x)))),
       ANOVA.pval=map_dbl(ANOVA, ~ round(.x["Pr(>F)1"], 9)))


# Create a vector storing p-values which will be used in plotting
ANOVA.pval <- plotlabels.fn(lfcTable.rela$ANOVA.pval, 
                            lfcTable.rela$data, 
                            nrow(lfcTable.rela))
# Unnest 
lfcTable.rela <- lfcTable.rela %>% 
    unnest(data) 

# Add a column storing previously created p-value vector
lfcTable.rela$ANOVA.pval <- ANOVA.pval


# Plot 
boxplot1.fn("baseMean", T) 

boxplot1.fn("log2FoldChange", T) 

boxplot1.fn("padj", F)

Difference comparison in baseMean/LFC/FDR between Aligners

# Assign names of the list
col.of.int <- c("Subtraction", "Difference") 

# Slice and clean the data frame
lfcTable.diff <- dplyr::select(lfcTable, 
                               starts_with(c("mean_", 
                                             "lfc_", 
                                             "FDR_"))) %>% 
dplyr::select(-ends_with(c("Salmon", "STAR", "HISAT2")))

# Create an empty data frame 
lfcTable.sub <- data.frame() 

# Clean and save in the data frame
for (x in c("mean_", "lfc_", "FDR_")) {

    df <- lfcTable.diff %>%
    gather(Subtraction, Difference, starts_with(x)) %>%
    dplyr::select(col.of.int) 

if (x == "mean_") {

    lfcTable.sub <- df

} else {

    lfcTable.sub <- rbind(lfcTable.sub, df) 

}
}

# Explore the output 
head(lfcTable.sub)
##   Subtraction  Difference
## 1  mean_SA_ST -114.450102
## 2  mean_SA_ST  -34.544247
## 3  mean_SA_ST   -1.690775
## 4  mean_SA_ST   13.604757
## 5  mean_SA_ST 1090.849492
## 6  mean_SA_ST -150.671758
# Rearrange the data frame 
lfcTable.sub <- lfcTable.sub %>% 
    separate(Subtraction, c("Metric", "From", "To"), sep="_") %>%

mutate(Metric=ifelse(Metric == "mean", "baseMean", 
                      ifelse(Metric == "lfc", "log2FoldChange", "padj")),
       From=ifelse(From == "SA", "Salmon", 
                   ifelse(From == "ST", "STAR", "HISAT2")),
       To=ifelse(To == "SA", "Salmon", 
                 ifelse(To == "ST", "STAR", "HISAT2")),

       Subtraction=paste(From, "-", To)) %>%

# Nest by Metric
group_by(Metric) %>%
nest() 

# Conduct ANOVA test in the subtraction data
lfcTable.sub <- lfcTable.sub %>%
    mutate(ANOVA=map(data, ~aov(Difference ~ Subtraction, data=.x)),
           ANOVA.summary=map(ANOVA, ~ unlist(summary(.x))),
           ANOVA.pval=map_dbl(ANOVA.summary, ~ .x["Pr(>F)1"]))

# Create a vector storing pvalue labels
sub.ANOVA.pval <- plotlabels.fn(lfcTable.sub$ANOVA.pval, 
                                lfcTable.sub$data, 
                                nrow(lfcTable.sub))

# Unnest the data frame 
lfcTable.sub <- lfcTable.sub %>% 
    unnest(data)

# Add a column storing pvalues
lfcTable.sub$ANOVA.pval <- sub.ANOVA.pval

# Set a function creating a box plot
boxplot2.fn <- function(metric, ylog=F) {

    df <- subset(lfcTable.sub, Metric == metric)

p <- ggplot(df, 
            aes(x= Subtraction, y=abs(Difference), 
                fill=Subtraction, label=ANOVA.pval)) + 
           geom_boxplot(alpha=0.5) + 
           theme_bw() + 
           xlab("Subtraction") + 
           ylab("Difference") + 
           theme(strip.text.x=element_text(size=10)) +
           ggtitle(paste("Subtraction Comparison in", 
                         metric,
                         "\n(P-value from Two-Tailed ANOVA Test)")) + 
           geom_text(size=5, mapping=aes(x=Inf, y=Inf), vjust=2, hjust=1.2)

if (ylog) {

    p <- p + scale_y_log10()

}

return(p) 
}


# Print the boxplots comparing subtraction & statistical significance by metric
boxplot2.fn("baseMean", T) 

boxplot2.fn("log2FoldChange", T) 

boxplot2.fn("padj", F) 

Exploration of standard deviation (SD) of baseMean, log2FoldChange, and padj across the alingers

# Create an empty data frame 
lfcTable.sd <- data.frame()

for (x in my.param) {

    # Slice the data frame with columns of interest
    df <- dplyr::select(lfcTable, 
                           starts_with(x)) 

    # Calculate Mean across the aligners
    df$Mean <- rowMeans(df)

    # Calculate SD across the aligners 
    df$SD <- apply(df[, 1:3], 1, sd)

    # Add a column storing the number of alternative transcripts
    df$num.trans <- lfcTable$num.trans 

    # Change column names
    colnames(df) <- str_replace(colnames(df), paste0(x, "_"), "")

    # Add a column storing metric 
    df$Metric <- x

    # Save as a data frame
    if (x == my.param[1]) {

        lfcTable.sd <- df 
        
    } else {

        lfcTable.sd <- rbind(lfcTable.sd, df) 

    }

}


# Explore the output data frame
head(lfcTable.sd)
##        Salmon       STAR     HISAT2        Mean          SD num.trans   Metric
## 1  3182.57630  3297.0264  3245.3292  3241.64397  57.3139792         3 baseMean
## 2   574.54499   609.0892   586.1192   589.91781  17.5826138        14 baseMean
## 3    32.55512    34.2459    33.3595    33.38684   0.8457188         1 baseMean
## 4   567.90952   554.3048   546.0067   556.07367  11.0580162         7 baseMean
## 5 33671.54570 32580.6962 32115.4478 32789.22991 798.7333070         8 baseMean
## 6  4399.69082  4550.3626  4473.6172  4474.55688  75.3402737         3 baseMean
dim(lfcTable.sd)
## [1] 3597    7
# Nest the data frame by Metric
lfcTable.sd <- lfcTable.sd %>% 
    group_by(Metric) %>%
    nest() 


# Explore the output data frame
lfcTable.sd 
## # A tibble: 3 x 2
## # Groups:   Metric [3]
##   Metric         data                
##   <chr>          <list>              
## 1 baseMean       <tibble [1,199 × 6]>
## 2 log2FoldChange <tibble [1,199 × 6]>
## 3 padj           <tibble [1,199 × 6]>
# Set a function creating a scatter plot demonstrating SD over the number of alternative transcripts 
scatter1.fn <- function(df, mlog=F, ylog=F, metric) {

    if (mlog) {

        p <- ggplot(df, aes(x=num.trans, y=SD, color=log(Mean)))

    } else {

        p <- ggplot(df, aes(x=num.trans, y=SD, color=Mean))

    }

    p <- p + 
        geom_point(alpha=0.5) +
        theme_bw() +         
        xlab("Number of Transcripts") + 
ggtitle(paste("Relationship between", 
              metric, 
              "Standard Deviation and\nNumber of Transcripts")) +  
scale_color_gradient(low="blue", high="red")

if (ylog) {

    p <- p + 
        scale_y_log10() + 
        ylab("Log (Standard Deviation)")

} else {

    p <- p + ylab("Standard Deviation") 
}



return(p) 

}


# Create and save the scatter plots
lfcTable.sd <- lfcTable.sd %>%

    # sd.plot = No log transformation in the plot
    # sd.plot.logMean = log(Mean)
    # sd.plot.logY = log(SD), 
    # sd.plot.logBoth = log(Mean) & log(SD)
    mutate(sd.plot=map(data, ~ scatter1.fn(.x, F, F, Metric)),
           sd.plot.logMean=map(data, ~ scatter1.fn(.x, T, F, Metric)),
           sd.plot.logY=map(data, ~ scatter1.fn(.x, F, T, Metric)), 
           sd.plot.logBoth=map(data, ~ scatter1.fn(.x, T, T, Metric)))


# Set a function shortening scripts
gr.ar.fn2 <- function(pl1, pl2, pl3) {

    grid.arrange(pl1, pl2, pl3, nrow=1) 
}

# Explore the plots 
gr.ar.fn2(lfcTable.sd$sd.plot[[1]], 
          lfcTable.sd$sd.plot[[2]], 
          lfcTable.sd$sd.plot[[3]]) 

gr.ar.fn2(lfcTable.sd$sd.plot.logMean[[1]], 
             lfcTable.sd$sd.plot.logMean[[2]], 
             lfcTable.sd$sd.plot.logMean[[3]])

gr.ar.fn2(lfcTable.sd$sd.plot.logY[[1]], 
             lfcTable.sd$sd.plot.logY[[2]],
             lfcTable.sd$sd.plot.logY[[3]])

gr.ar.fn2(lfcTable.sd$sd.plot.logBoth[[1]],
             lfcTable.sd$sd.plot.logBoth[[2]], 
             lfcTable.sd$sd.plot.logBoth[[3]])

# Print the optimal presentation
gr.ar.fn2(lfcTable.sd$sd.plot.logBoth[[1]],
             lfcTable.sd$sd.plot.logBoth[[2]], 
             lfcTable.sd$sd.plot[[3]])

Exploring relationship between difference/percent difference vs the number of alternative transcripts

# Add columns storing percent difference
lfcTable <- lfcTable %>%

    # mDiff: difference in meanBase
    # lDiff: difference in log2FoldChange
    # fDiff: difference in padj (FDR)
    mutate(mDiff_percent=100 * mean_SA_ST / (baseMean_Salmon + baseMean_STAR),
           lDiff_percent=100 * lfc_SA_ST / (abs(log2FoldChange_Salmon) + abs(log2FoldChange_STAR)),
           fDiff_percent=100 * FDR_SA_ST / (padj_Salmon + padj_STAR))

# Explore the output
head(lfcTable)
##               GENEID baseMean_Salmon log2FoldChange_Salmon lfcSE_Salmon
## 1 ENSMUSG00000000078      3182.57630            -0.2540106   0.07029216
## 2 ENSMUSG00000000085       574.54499            -0.3051097   0.11172615
## 3 ENSMUSG00000000125        32.55512            -1.6815257   0.54687724
## 4 ENSMUSG00000000142       567.90952             0.4233999   0.15963537
## 5 ENSMUSG00000000184     33671.54570            -0.2175147   0.08092099
## 6 ENSMUSG00000000275      4399.69082            -0.2347587   0.08326712
##   stat_Salmon pvalue_Salmon padj_Salmon FDR_Salmon Alignment_Salmon
## 1   -3.613640  0.0003019283 0.009025866      < 0.1           Salmon
## 2   -2.730871  0.0063167266 0.078616294      < 0.1           Salmon
## 3   -3.074777  0.0021065986 0.035676802      < 0.1           Salmon
## 4    2.652294  0.0079946945 0.092354125      < 0.1           Salmon
## 5   -2.687988  0.0071883891 0.086318131      < 0.1           Salmon
## 6   -2.819344  0.0048121884 0.064485900      < 0.1           Salmon
##   baseMean_STAR log2FoldChange_STAR lfcSE_STAR stat_STAR  pvalue_STAR
## 1     3297.0264          -0.2457384 0.06789377 -3.619455 0.0002952245
## 2      609.0892          -0.2974915 0.10151662 -2.930471 0.0033844842
## 3       34.2459          -1.5923501 0.50259104 -3.168282 0.0015334277
## 4      554.3048           0.4313936 0.15684032  2.750527 0.0059499441
## 5    32580.6962          -0.2126708 0.07890771 -2.695184 0.0070349764
## 6     4550.3626          -0.2311471 0.08246804 -2.802869 0.0050650243
##     padj_STAR FDR_STAR Alignment_STAR baseMean_HISAT2 log2FoldChange_HISAT2
## 1 0.008914826    < 0.1           STAR       3245.3292            -0.2448183
## 2 0.054157532    < 0.1           STAR        586.1192            -0.2902909
## 3 0.029873916    < 0.1           STAR         33.3595            -1.5804287
## 4 0.081968251    < 0.1           STAR        546.0067             0.4288078
## 5 0.092687424    < 0.1           STAR      32115.4478            -0.2137057
## 6 0.072494399    < 0.1           STAR       4473.6172            -0.2329771
##   lfcSE_HISAT2 stat_HISAT2 pvalue_HISAT2 padj_HISAT2 FDR_HISAT2
## 1   0.06846413   -3.575863  0.0003490748  0.01051970      < 0.1
## 2   0.10679988   -2.718082  0.0065661506  0.09105713      < 0.1
## 3   0.50731877   -3.115258  0.0018378428  0.03526692      < 0.1
## 4   0.15500097    2.766485  0.0056664239  0.08250596      < 0.1
## 5   0.07922648   -2.697402  0.0069882806  0.09507324      < 0.1
## 6   0.08238260   -2.827989  0.0046841398  0.07161359      < 0.1
##   Alignment_HISAT2  mean_SA_ST   mean_SA_HI  mean_ST_HI    lfc_SA_ST
## 1           HISAT2 -114.450102  -62.7529172  51.6971844 -0.008272156
## 2           HISAT2  -34.544247  -11.5742133  22.9700335 -0.007618142
## 3           HISAT2   -1.690775   -0.8043754   0.8863992 -0.089175577
## 4           HISAT2   13.604757   21.9027794   8.2980223 -0.007993669
## 5           HISAT2 1090.849492 1556.0979011 465.2484087 -0.004843870
## 6           HISAT2 -150.671758  -73.9264127  76.7453451 -0.003611537
##      lfc_SA_HI     lfc_ST_HI     FDR_SA_ST     FDR_SA_HI     FDR_ST_HI
## 1 -0.009192246 -0.0009200906  0.0001110396 -0.0014938334 -0.0016048730
## 2 -0.014818795 -0.0072006531  0.0244587618 -0.0124408368 -0.0368995986
## 3 -0.101096974 -0.0119213968  0.0058028863  0.0004098853 -0.0053930010
## 4 -0.005407920  0.0025857491  0.0103858733  0.0098481622 -0.0005377112
## 5 -0.003809007  0.0010348636 -0.0063692929 -0.0087551080 -0.0023858151
## 6 -0.001781560  0.0018299765 -0.0080084993 -0.0071276865  0.0008808128
##   num.trans mDiff_percent lDiff_percent fDiff_percent
## 1         3     -1.766314    -1.6552621     0.6189257
## 2        14     -2.918490    -1.2642096    18.4213730
## 3         1     -2.531061    -2.7238534     8.8525137
## 4         7      1.212314    -0.9351579     5.9578544
## 5         8      1.646510    -1.1259958    -3.5581538
## 6         3     -1.683473    -0.7751646    -5.8464607
dim(lfcTable)
## [1] 1199   38
# Create an empty data frame
lfcTable.percent <- data.frame()


for (x in my.param) {

    
    init.vec <- c("GENEID", "num.trans")

    # Set columns of interest
    if (x == my.param[1]) { 

        col.of.interest <- c(init.vec, "mean_SA_ST", "mDiff_percent")

    } else if (x == my.param[2]) {

        col.of.interest <- c(init.vec, "lfc_SA_ST", "lDiff_percent")

    } else {

        col.of.interest <- c(init.vec, "FDR_SA_ST", "fDiff_percent") 

    }

    # Trim the table 
    df <- lfcTable[, colnames(lfcTable) %in% col.of.interest] %>%
        mutate(Metric=x) %>%
        dplyr::rename(Difference=ends_with("ST"), 
                      Percent=ends_with("percent")) 

    # Save in the empty data frame 
    if (x == my.param[1]) {

        lfcTable.percent <- df

    } else {

        lfcTable.percent <- rbind(lfcTable.percent, df)

    }


}

# Explore the output
head(lfcTable.percent)
##               GENEID  Difference num.trans   Percent   Metric
## 1 ENSMUSG00000000078 -114.450102         3 -1.766314 baseMean
## 2 ENSMUSG00000000085  -34.544247        14 -2.918490 baseMean
## 3 ENSMUSG00000000125   -1.690775         1 -2.531061 baseMean
## 4 ENSMUSG00000000142   13.604757         7  1.212314 baseMean
## 5 ENSMUSG00000000184 1090.849492         8  1.646510 baseMean
## 6 ENSMUSG00000000275 -150.671758         3 -1.683473 baseMean
dim(lfcTable.percent)
## [1] 3597    5
# Nest the data frame by Metric
lfcTable.percent <- lfcTable.percent %>%
    group_by(Metric) %>%
    nest()

# Explore the output
lfcTable.percent
## # A tibble: 3 x 2
## # Groups:   Metric [3]
##   Metric         data                
##   <chr>          <list>              
## 1 baseMean       <tibble [1,199 × 4]>
## 2 log2FoldChange <tibble [1,199 × 4]>
## 3 padj           <tibble [1,199 × 4]>
head(lfcTable.percent$data[[1]])
## # A tibble: 6 x 4
##   GENEID             Difference num.trans Percent
##   <chr>                   <dbl>     <int>   <dbl>
## 1 ENSMUSG00000000078    -114.           3   -1.77
## 2 ENSMUSG00000000085     -34.5         14   -2.92
## 3 ENSMUSG00000000125      -1.69         1   -2.53
## 4 ENSMUSG00000000142      13.6          7    1.21
## 5 ENSMUSG00000000184    1091.           8    1.65
## 6 ENSMUSG00000000275    -151.           3   -1.68
# Calculate correlation (Rsquared) 
# DvsP: Percent Difference vs Difference
# DvsT: Difference vs Number of Transcripts
# PvsT: Percent Difference vs Number of Transcripts
lfcTable.percent <- lfcTable.percent %>%
    mutate(DvsP.mod=map(data, ~ lm(Percent ~ Difference, data=.x)), 
           DvsP.rsq=map_dbl(DvsP.mod, ~ summary(.x)$r.squared),
           DvsT.mod=map(data, ~ lm(Difference ~ num.trans, data=.x)),
           DvsT.rsq=map_dbl(DvsT.mod, ~ summary(.x)$r.squared), 
           PvsT.mod=map(data, ~ lm(Percent ~ num.trans, data=.x)), 
           PvsT.rsq=map_dbl(PvsT.mod, ~ summary(.x)$r.squared))

# Add Rsquard values to the data frame as plot label columns
DvsP.r <- plotlabels.fn(lfcTable.percent$DvsP.rsq, 
                        lfcTable.percent$data, 
                        nrow(lfcTable.percent))

DvsT.r <- plotlabels.fn(lfcTable.percent$DvsT.rsq, 
                        lfcTable.percent$data, 
                        nrow(lfcTable.percent))

PvsT.r <- plotlabels.fn(lfcTable.percent$PvsT.rsq, 
                        lfcTable.percent$data, 
                        nrow(lfcTable.percent))

lfcTable.percent <- lfcTable.percent %>%
    unnest(data) 

lfcTable.percent$DvsP.Rsquared <- DvsP.r 
lfcTable.percent$DvsT.Rsquared <- DvsT.r
lfcTable.percent$PvsT.Rsquared <- PvsT.r


# Explore the output data frame
head(lfcTable.percent)
## # A tibble: 6 x 14
## # Groups:   Metric [1]
##   Metric GENEID Difference num.trans Percent DvsP.mod DvsP.rsq DvsT.mod DvsT.rsq
##   <chr>  <chr>       <dbl>     <int>   <dbl> <list>      <dbl> <list>      <dbl>
## 1 baseM… ENSMU…    -114.           3   -1.77 <lm>        0.155 <lm>      1.75e-5
## 2 baseM… ENSMU…     -34.5         14   -2.92 <lm>        0.155 <lm>      1.75e-5
## 3 baseM… ENSMU…      -1.69         1   -2.53 <lm>        0.155 <lm>      1.75e-5
## 4 baseM… ENSMU…      13.6          7    1.21 <lm>        0.155 <lm>      1.75e-5
## 5 baseM… ENSMU…    1091.           8    1.65 <lm>        0.155 <lm>      1.75e-5
## 6 baseM… ENSMU…    -151.           3   -1.68 <lm>        0.155 <lm>      1.75e-5
## # … with 5 more variables: PvsT.mod <list>, PvsT.rsq <dbl>,
## #   DvsP.Rsquared <chr>, DvsT.Rsquared <chr>, PvsT.Rsquared <chr>
# Set a function creating scatter plot
scatter2.fn <- function(label.col, xtitle, ytitle, title, ylog=F, xlog=F, label=F) { 

p <- ggplot(lfcTable.percent) + 
theme_bw() + 
scale_color_gradient(low="blue", high="red") + 
facet_wrap(~Metric, scales = "free") + 
theme(strip.text.x=element_text(size=10)) + 
ggtitle(paste(title, "in Salmon vs STAR")) + 
xlab(xtitle) + 
ylab(ytitle)

if (ylog) {

    p <- p + scale_y_log10() + ylab(paste0("Log (", ytitle, ")")) 

} 

if (xlog) { 

    p <- p + scale_x_log10() + xlab(paste0("Log (", xtitle, ")")) 


}


if (label) {


p <- p + geom_text(data=lfcTable.percent, 
          size=5, 
          mapping=aes(label=label.col, x=Inf, y=Inf), 
          vjust=2, hjust=1.1, color="black") + 
ggtitle(paste(title, "in Salmon vs STAR", "(with R-Squared)"))
}



return(p)
}


# Create and save the plots
# P1 & 2: D vs P with or without log transformation of x values
# P3 & 4: P vs T with or without log transformation of y values
# P5 & 6: D vs T with or without log transformation of y values
p1 <- scatter2.fn(lfcTable.percent$DvsP.Rsquared, 
            "Salmon - STAR", 
            "100 x (Salmon - STAR) / (Salmon + STAR)", 
            "Difference vs Percent Difference", label=T) + 
geom_point(data=lfcTable.percent, alpha=0.5,
           aes(x=abs(Difference), y=abs(Percent), color=log(num.trans)))

p2 <- scatter2.fn(lfcTable.percent$DvsP.Rsquared, 
            "Salmon - STAR", 
            "100 x (Salmon - STAR) / (Salmon + STAR)", 
            "Difference vs Percent Difference", 
            xlog=T) + 
geom_point(data=lfcTable.percent, alpha=0.5,
           aes(x=abs(Difference), y=abs(Percent), color=log(num.trans)))


p3 <- scatter2.fn(lfcTable.percent$PvsT.Rsquared, 
            "Number of Alternative Transcripts", 
            "100 x (Salmon - STAR) / (Salmon + STAR)", 
            "Percent Difference vs Number of Alternative Transcripts",
            label=T) + 
    geom_point(data=lfcTable.percent, alpha=0.5,
               aes(x=num.trans, y=abs(Percent)))

p4 <- scatter2.fn(lfcTable.percent$PvsT.Rsquared, 
            "Number of Alternative Transcripts", 
            "100 x (Salmon - STAR) / (Salmon + STAR)", 
            "Percent Difference vs Number of Alternative Transcripts",
            ylog=T) + 
    geom_point(data=lfcTable.percent, alpha=0.5,
               aes(x=num.trans, y=abs(Percent)))

p5 <- scatter2.fn(lfcTable.percent$DvsT.Rsquared, 
            "Number of Alternative Transcripts", 
            "Salmon - STAR", 
            "Difference vs Number of Alternative Transcripts", 
            label=T) + 
    geom_point(data=lfcTable.percent, alpha=0.5,
               aes(x=num.trans, y=abs(Difference)))

p6 <- scatter2.fn(lfcTable.percent$DvsT.Rsquared, 
            "Number of Alternative Transcripts", 
            "Salmon - STAR", 
            "Difference vs Number of Alternative Transcripts",
            ylog=T) + 
    geom_point(data=lfcTable.percent, alpha=0.5,
               aes(x=num.trans, y=abs(Difference)))


# Print the plots
grid.arrange(p1, p2, ncol=1)

grid.arrange(p3, p4, ncol=1)

grid.arrange(p5, p6, ncol=1)

Comparison of Ranking between Salmon and STAR in baseMean/log2FoldChange/FDR

# Set a function rearranging a data frame
rank.fn <- function(df, var, desc=F) { 
    
    if (desc) { 

        df <- dplyr::arrange(df, desc(var)) 

    } else {

        df <- dplyr::arrange(df, var) 

    }

    return(df)

}


# Set a function creating a rank plot
rank.plot.fn <- function(df, metric, colog=F) { 

    if (colog) { 

        p <- ggplot(df, aes(x=Rank.x, 
                            y=Rank.y, 
                            color=log(MeanValue), 
                            label=Rsquared))

    } else {

        p <- ggplot(df, aes(x=Rank.x, 
                            y=Rank.y, 
                            color=MeanValue,
                            label=Rsquared))

    }
    
    p <- p + 
        geom_point(alpha=0.5) + 
        theme_bw() + 
        scale_color_gradient(low="blue", high="red") + 
        xlab("Salmon") + 
        ylab("STAR") + 
        ggtitle(paste("Rank Comparison in", metric, "\n(with R-Squared)")) + geom_text(size=5, 
                mapping=aes(x=Inf, y=Inf), vjust=2, hjust=1.0, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black")  

    return(p)

}
# Clean the data frame 
lfcTable.rank <- lfcTable %>% 

    # Splice by columns of interest
    dplyr::select(GENEID, 
                  starts_with(c("baseMean_", "log2FoldChange_", "padj_")), 
                  -ends_with("HISAT2")) %>% 

# Reform the table
gather(Metric, Value, -GENEID) %>%

# Nest by Metric
group_by(Metric) %>%
nest() %>% 

# Add columns storing ascending or descending order by variable of interest
mutate(Ascending=map(data, ~ rank.fn(.x, .x$Value)), 
       Descending=map(data, ~ rank.fn(.x, .x$Value, T)),
       Final=map2(Ascending, 
                  Descending, 
                  ~ ifelse(Metric %in% c("padj_Salmon", "padj_STAR"),
                           Ascending, 
                           Descending)), 
       Final=map(Final, ~ .x[[1]]),

       # Add a column storing rank
       Final=map(Final, ~ mutate(.x, Rank=1:nrow(.x)))) %>%

# Unnest 
unnest(Final) %>% 

# Reform the table 
separate(Metric, c("Metric", "Aligner")) %>% 
dplyr::select(-data, -Ascending, -Descending) %>%

# Renest by metric
group_by(Metric) %>%
nest() %>%

# Reclean the data frame 
mutate(Split=map(data, ~ split(.x, .x$Aligner)),
       Joined=map(Split, ~ inner_join(.x[[1]], 
                                      .x[[2]], 
                                      by=c("GENEID")))) %>%
dplyr::select(-data, -Split) %>%

# Unnest
unnest(Joined) %>%

# Calculate and save MeanRank and MeanValue
mutate(MeanRank=(Rank.x + Rank.y)/2, 
       MeanValue=abs((Value.x + Value.y)/2)) %>% 


# Nest by Metric
nest(-Metric) %>%

# Add a column storing R^2
mutate(Rsquared=map_dbl(data, ~ cor(.x$Rank.x, .x$Rank.y)))

# Explore the data frame 
head(lfcTable.rank)
## # A tibble: 3 x 3
## # Groups:   Metric [3]
##   Metric         data                 Rsquared
##   <chr>          <list>                  <dbl>
## 1 baseMean       <tibble [1,199 × 9]>    0.993
## 2 log2FoldChange <tibble [1,199 × 9]>    0.999
## 3 padj           <tibble [1,199 × 9]>    0.926
head(lfcTable.rank$data)
## [[1]]
## # A tibble: 1,199 x 9
##    Aligner.x GENEID   Value.x Rank.x Aligner.y Value.y Rank.y MeanRank MeanValue
##    <chr>     <chr>      <dbl>  <int> <chr>       <dbl>  <int>    <dbl>     <dbl>
##  1 Salmon    ENSMUSG…  72706.      1 STAR       76196.      1      1      74451.
##  2 Salmon    ENSMUSG…  53884.      2 STAR       56130.      2      2      55007.
##  3 Salmon    ENSMUSG…  53308.      3 STAR       56022.      3      3      54665.
##  4 Salmon    ENSMUSG…  46759.      4 STAR       50141.      5      4.5    48450.
##  5 Salmon    ENSMUSG…  43438.      5 STAR       44629.      6      5.5    44033.
##  6 Salmon    ENSMUSG…  39293.      6 STAR       40984.      8      7      40138.
##  7 Salmon    ENSMUSG…  39062.      7 STAR       41583.      7      7      40323.
##  8 Salmon    ENSMUSG…  38475.      8 STAR       40024.      9      8.5    39249.
##  9 Salmon    ENSMUSG…  34056.      9 STAR       35897.     10      9.5    34976.
## 10 Salmon    ENSMUSG…  33672.     10 STAR       32581.     12     11      33126.
## # … with 1,189 more rows
## 
## [[2]]
## # A tibble: 1,199 x 9
##    Aligner.x GENEID   Value.x Rank.x Aligner.y Value.y Rank.y MeanRank MeanValue
##    <chr>     <chr>      <dbl>  <int> <chr>       <dbl>  <int>    <dbl>     <dbl>
##  1 Salmon    ENSMUSG…    3.82      1 STAR         4.16      1      1        3.99
##  2 Salmon    ENSMUSG…    3.52      2 STAR         3.41      2      2        3.47
##  3 Salmon    ENSMUSG…    3.00      3 STAR         2.81      4      3.5      2.90
##  4 Salmon    ENSMUSG…    2.97      4 STAR         2.63      5      4.5      2.80
##  5 Salmon    ENSMUSG…    2.80      5 STAR         2.91      3      4        2.85
##  6 Salmon    ENSMUSG…    2.64      6 STAR         2.47      8      7        2.56
##  7 Salmon    ENSMUSG…    2.59      7 STAR         2.61      6      6.5      2.60
##  8 Salmon    ENSMUSG…    2.51      8 STAR         2.55      7      7.5      2.53
##  9 Salmon    ENSMUSG…    2.50      9 STAR         2.31     12     10.5      2.41
## 10 Salmon    ENSMUSG…    2.39     10 STAR         2.42      9      9.5      2.41
## # … with 1,189 more rows
## 
## [[3]]
## # A tibble: 1,199 x 9
##    Aligner.x GENEID  Value.x Rank.x Aligner.y  Value.y Rank.y MeanRank MeanValue
##    <chr>     <chr>     <dbl>  <int> <chr>        <dbl>  <int>    <dbl>     <dbl>
##  1 Salmon    ENSMU… 3.34e-87      1 STAR      1.32e-93      1      1    1.67e-87
##  2 Salmon    ENSMU… 1.89e-32      2 STAR      4.06e-39      2      2    9.45e-33
##  3 Salmon    ENSMU… 5.65e-31      3 STAR      2.43e-33      3      3    2.84e-31
##  4 Salmon    ENSMU… 1.25e-30      4 STAR      5.02e-31      4      4    8.74e-31
##  5 Salmon    ENSMU… 3.48e-26      5 STAR      1.98e-26      5      5    2.73e-26
##  6 Salmon    ENSMU… 8.12e-24      6 STAR      6.40e-25      6      6    4.38e-24
##  7 Salmon    ENSMU… 3.55e-21      7 STAR      2.02e-18     13     10    1.01e-18
##  8 Salmon    ENSMU… 5.19e-21      8 STAR      1.96e-24      7      7.5  2.60e-21
##  9 Salmon    ENSMU… 3.83e-20      9 STAR      1.49e-22      8      8.5  1.92e-20
## 10 Salmon    ENSMU… 1.57e-17     10 STAR      7.29e-21      9      9.5  7.85e-18
## # … with 1,189 more rows
# Create a vector storing R^2 values as an input of ggplot2() 
Rsquared <- plotlabels.fn(lfcTable.rank$Rsquared, 
                          lfcTable.rank$data, 
                          nrow(lfcTable.rank))

# Unnest and add a column storing ggplot2's input R^2
lfcTable.rank <- lfcTable.rank %>%
    unnest(data) 

lfcTable.rank$Rsquared <- Rsquared


# Check out the new column added
head(lfcTable.rank)
## # A tibble: 6 x 11
## # Groups:   Metric [1]
##   Metric  Aligner.x GENEID      Value.x Rank.x Aligner.y Value.y Rank.y MeanRank
##   <chr>   <chr>     <chr>         <dbl>  <int> <chr>       <dbl>  <int>    <dbl>
## 1 baseMe… Salmon    ENSMUSG000…  72706.      1 STAR       76196.      1      1  
## 2 baseMe… Salmon    ENSMUSG000…  53884.      2 STAR       56130.      2      2  
## 3 baseMe… Salmon    ENSMUSG000…  53308.      3 STAR       56022.      3      3  
## 4 baseMe… Salmon    ENSMUSG000…  46759.      4 STAR       50141.      5      4.5
## 5 baseMe… Salmon    ENSMUSG000…  43438.      5 STAR       44629.      6      5.5
## 6 baseMe… Salmon    ENSMUSG000…  39293.      6 STAR       40984.      8      7  
## # … with 2 more variables: MeanValue <dbl>, Rsquared <chr>
# Clean the data frame
lfcTable.rank <- lfcTable.rank %>%

    # Nest by metric
    group_by(Metric) %>%
    nest() %>%

    # Add columns storing scatter plots
    mutate(RankPlot=map(data, ~ rank.plot.fn(.x, Metric)), 
           RankPlot.log=map(data, ~ rank.plot.fn(.x, Metric, T)))


# Set a function simplifying grid.arrange() 
gr.ar.fn3 <- function(pl1, pl2) {

    grid.arrange(pl1, pl2, nrow=1)

}


# Explore the plots
gr.ar.fn3(lfcTable.rank$RankPlot[[1]], 
                lfcTable.rank$RankPlot.log[[1]])

gr.ar.fn3(lfcTable.rank$RankPlot[[2]], 
                lfcTable.rank$RankPlot.log[[2]])

gr.ar.fn3(lfcTable.rank$RankPlot[[3]], 
                lfcTable.rank$RankPlot.log[[3]])

# Complete the optimal presentation for the plots
gr.ar.fn2(lfcTable.rank$RankPlot.log[[1]], 
          lfcTable.rank$RankPlot.log[[2]],
          lfcTable.rank$RankPlot[[3]])

Saving difference tables

for (x in my.param) {

    write.csv(subset(lfcTable.rank, Metric == x)$data, 
              paste0(x, "_difference_SA_ST.csv")) 

}

Saving gene rankings

lfcTable.csv <- lfcTable %>%
    dplyr::select(GENEID, starts_with(my.param[-1])) %>% 
    gather(Category, Value, -GENEID) %>%
    separate(Category, c("Metric", "Aligner")) %>% 
    pivot_wider(names_from=Metric, values_from=Value) 

write.csv(lfcTable.csv, "lfcTable.csv") 

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/mira/miniconda3/envs/r/lib/libblas.so.3.8.0
## LAPACK: /home/mira/miniconda3/envs/r/lib/liblapack.so.3.8.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3               ensembldb_2.14.0           
##  [3] AnnotationFilter_1.14.0     GenomicFeatures_1.42.1     
##  [5] AnnotationDbi_1.52.0        UpSetR_1.4.0               
##  [7] DESeq2_1.30.1               SummarizedExperiment_1.20.0
##  [9] Biobase_2.50.0              MatrixGenerics_1.2.0       
## [11] matrixStats_0.58.0          GenomicRanges_1.42.0       
## [13] GenomeInfoDb_1.26.0         IRanges_2.24.1             
## [15] S4Vectors_0.28.1            Rsubread_2.4.0             
## [17] tximport_1.18.0             AnnotationHub_2.22.0       
## [19] BiocFileCache_1.14.0        dbplyr_2.1.0               
## [21] BiocGenerics_0.36.0         pheatmap_1.0.12            
## [23] rmarkdown_2.7               forcats_0.5.1              
## [25] stringr_1.4.0               dplyr_1.0.5                
## [27] purrr_0.3.4                 readr_1.4.0                
## [29] tidyr_1.1.3                 tibble_3.1.0               
## [31] ggplot2_3.3.3               tidyverse_1.3.0            
## [33] data.table_1.14.0          
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-0              ellipsis_0.3.1               
##   [3] XVector_0.30.0                fs_1.5.0                     
##   [5] rstudioapi_0.13               farver_2.0.3                 
##   [7] bit64_4.0.5                   interactiveDisplayBase_1.28.0
##   [9] fansi_0.4.2                   lubridate_1.7.10             
##  [11] xml2_1.3.2                    splines_4.0.3                
##  [13] cachem_1.0.4                  geneplotter_1.68.0           
##  [15] knitr_1.31                    jsonlite_1.7.2               
##  [17] Rsamtools_2.6.0               broom_0.7.5                  
##  [19] annotate_1.68.0               shiny_1.6.0                  
##  [21] BiocManager_1.30.10           compiler_4.0.3               
##  [23] httr_1.4.2                    backports_1.2.1              
##  [25] lazyeval_0.2.2                assertthat_0.2.1             
##  [27] Matrix_1.3-2                  fastmap_1.1.0                
##  [29] cli_2.3.1                     later_1.1.0.1                
##  [31] prettyunits_1.1.1             htmltools_0.5.1.1            
##  [33] tools_4.0.3                   gtable_0.3.0                 
##  [35] glue_1.4.2                    GenomeInfoDbData_1.2.4       
##  [37] rappdirs_0.3.3                Rcpp_1.0.6                   
##  [39] cellranger_1.1.0              jquerylib_0.1.3              
##  [41] Biostrings_2.58.0             vctrs_0.3.6                  
##  [43] rtracklayer_1.50.0            xfun_0.21                    
##  [45] ps_1.5.0                      rvest_0.3.6                  
##  [47] mime_0.10                     lifecycle_1.0.0              
##  [49] XML_3.99-0.5                  zlibbioc_1.36.0              
##  [51] scales_1.1.1                  ProtGenerics_1.22.0          
##  [53] hms_1.0.0                     promises_1.2.0.1             
##  [55] RColorBrewer_1.1-2            yaml_2.2.1                   
##  [57] curl_4.3                      memoise_2.0.0                
##  [59] sass_0.3.1                    biomaRt_2.46.3               
##  [61] stringi_1.5.3                 RSQLite_2.2.3                
##  [63] highr_0.8                     BiocVersion_3.12.0           
##  [65] genefilter_1.72.1             BiocParallel_1.24.0          
##  [67] rlang_0.4.10                  pkgconfig_2.0.3              
##  [69] bitops_1.0-6                  evaluate_0.14                
##  [71] lattice_0.20-41               labeling_0.4.2               
##  [73] GenomicAlignments_1.26.0      bit_4.0.4                    
##  [75] tidyselect_1.1.0              plyr_1.8.6                   
##  [77] magrittr_2.0.1                R6_2.5.0                     
##  [79] generics_0.1.0                DelayedArray_0.16.0          
##  [81] DBI_1.1.1                     pillar_1.5.0                 
##  [83] haven_2.3.1                   withr_2.4.1                  
##  [85] survival_3.2-7                RCurl_1.98-1.2               
##  [87] modelr_0.1.8                  crayon_1.4.1                 
##  [89] utf8_1.1.4                    progress_1.2.2               
##  [91] locfit_1.5-9.4                grid_4.0.3                   
##  [93] readxl_1.3.1                  blob_1.2.1                   
##  [95] reprex_1.0.0                  digest_0.6.27                
##  [97] xtable_1.8-4                  httpuv_1.5.5                 
##  [99] openssl_1.4.3                 munsell_0.5.0                
## [101] bslib_0.2.4                   askpass_1.1